diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index df5b2304bc3..37dbcd2ebb0 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -25,8 +25,11 @@ assignees: ''
-#### Output of ``xr.show_versions()``
-<details>
+#### Versions
+
+<details><summary>Output of `xr.show_versions()`</summary>
+
+</details>
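The collapsible block this hunk adds is meant to hold the output of `xr.show_versions()`. As a quick illustrative sketch (not part of the patch itself), the text a reporter pastes there is produced with:

```python
import xarray as xr

# Prints the versions of xarray and its required/optional dependencies;
# paste this output inside the collapsible "Versions" block of the report.
xr.show_versions()
```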
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a921bddaa23..c30202ac046 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,6 @@ - - [ ] Closes #xxxx + - [ ] Fixes #xxxx - [ ] Tests added - [ ] Passes `isort -rc . && black . && mypy . && flake8` - [ ] Fully documented, including `whats-new.rst` for all changes and `api.rst` for new API diff --git a/HOW_TO_RELEASE.md b/HOW_TO_RELEASE.md index 4ef7342a5ed..3fdd1d7236d 100644 --- a/HOW_TO_RELEASE.md +++ b/HOW_TO_RELEASE.md @@ -23,7 +23,7 @@ Time required: about an hour. 4. Check that the ReadTheDocs build is passing. 5. On the master branch, commit the release in git: ``` - git commit -a -m 'Release v0.X.Y' + git commit -am 'Release v0.X.Y' ``` 6. Tag the release: ``` @@ -60,10 +60,35 @@ Time required: about an hour. It's OK to force push to 'stable' if necessary. (We also update the stable branch with `git cherrypick` for documentation only fixes that apply the current released version.) -12. Add a section for the next release (v.X.(Y+1)) to doc/whats-new.rst. +12. Add a section for the next release (v.X.Y+1) to doc/whats-new.rst: + ``` + .. _whats-new.0.X.Y+1: + + v0.X.Y+1 (unreleased) + --------------------- + + Breaking changes + ~~~~~~~~~~~~~~~~ + + + New Features + ~~~~~~~~~~~~ + + + Bug fixes + ~~~~~~~~~ + + + Documentation + ~~~~~~~~~~~~~ + + + Internal Changes + ~~~~~~~~~~~~~~~~ + ``` 13. Commit your changes and push to master again: ``` - git commit -a -m 'New whatsnew section' + git commit -am 'New whatsnew section' git push upstream master ``` You're done pushing to master! @@ -88,15 +113,17 @@ Time required: about an hour. ``` git log "$(git tag --sort="v:refname" | sed -n 'x;$p').." --format="%aN" | sort -u ``` - or by replacing `v0.X.Y` with the _previous_ release in: + or by substituting the _previous_ release in: ``` - git log v0.X.Y.. --format="%aN" | sort -u + git log v0.X.Y-1.. --format="%aN" | sort -u ``` + NB: copying this output into a Google Groups form can cause + [issues](https://groups.google.com/forum/#!topic/xarray/hK158wAviPs) with line breaks, so take care Note on version numbering: We follow a rough approximation of semantic version. Only major releases (0.X.0) -show include breaking changes. Minor releases (0.X.Y) are for bug fixes and +should include breaking changes. Minor releases (0.X.Y) are for bug fixes and backwards compatible new features, but if a sufficient number of new features have arrived we will issue a major release even if there are no compatibility breaks. 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5789161c966..8d43de7b1d5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,15 +32,16 @@ jobs: steps: - template: ci/azure/unit-tests.yml -- job: MacOSX - strategy: - matrix: - py38: - conda_env: py38 - pool: - vmImage: 'macOS-10.13' - steps: - - template: ci/azure/unit-tests.yml +# excluded while waiting for https://github.com/conda-forge/libwebp-feedstock/issues/26 +# - job: MacOSX +# strategy: +# matrix: +# py38: +# conda_env: py38 +# pool: +# vmImage: 'macOS-10.15' +# steps: +# - template: ci/azure/unit-tests.yml - job: Windows strategy: diff --git a/ci/azure/install.yml b/ci/azure/install.yml index 958e3c180fa..60559dd2064 100644 --- a/ci/azure/install.yml +++ b/ci/azure/install.yml @@ -19,7 +19,6 @@ steps: --upgrade \ matplotlib \ numpy \ - pandas \ scipy python -m pip install \ --no-deps \ @@ -30,7 +29,8 @@ steps: git+https://github.com/Unidata/cftime \ git+https://github.com/mapbox/rasterio \ git+https://github.com/hgrecco/pint \ - git+https://github.com/pydata/bottleneck + git+https://github.com/pydata/bottleneck \ + git+https://github.com/pandas-dev/pandas condition: eq(variables['UPSTREAM_DEV'], 'true') displayName: Install upstream dev dependencies diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index 2c44e754cc4..2987303c92a 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -6,21 +6,22 @@ dependencies: - python=3.8 - bottleneck - cartopy - - cfgrib - - h5netcdf + - cfgrib>=0.9 + - dask>=2.10 + - h5netcdf>=0.7.4 - ipykernel - ipython - - iris + - iris>=2.3 - jupyter_client - nbsphinx - - netcdf4 + - netcdf4>=1.5 - numba - - numpy + - numpy>=1.17 - numpydoc - - pandas - - rasterio + - pandas>=1.0 + - rasterio>=1.1 - seaborn - setuptools - - sphinx - - sphinx_rtd_theme - - zarr + - sphinx>=2.3 + - sphinx_rtd_theme>=0.4 + - zarr>=2.4 \ No newline at end of file diff --git a/ci/requirements/py36-min-nep18.yml b/ci/requirements/py36-min-nep18.yml index c10fdf67dc4..a5eded49cd4 100644 --- a/ci/requirements/py36-min-nep18.yml +++ b/ci/requirements/py36-min-nep18.yml @@ -11,7 +11,7 @@ dependencies: - msgpack-python=0.6 # remove once distributed is bumped. distributed GH3491 - numpy=1.17 - pandas=0.25 - - pint=0.9 # Actually not enough as it doesn't implement __array_function__yet! 
+ - pint=0.11 - pip - pytest - pytest-cov diff --git a/conftest.py b/conftest.py index 25dc284975e..712af1d3759 100644 --- a/conftest.py +++ b/conftest.py @@ -21,3 +21,14 @@ def pytest_runtest_setup(item): pytest.skip( "set --run-network-tests to run test requiring an " "internet connection" ) + + +@pytest.fixture(autouse=True) +def add_standard_imports(doctest_namespace): + import numpy as np + import pandas as pd + import xarray as xr + + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["xr"] = xr diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 437f53b1a91..cc9517a98ba 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -379,7 +379,6 @@ Variable.min Variable.no_conflicts Variable.notnull - Variable.pad_with_fill_value Variable.prod Variable.quantile Variable.rank @@ -453,7 +452,6 @@ IndexVariable.min IndexVariable.no_conflicts IndexVariable.notnull - IndexVariable.pad_with_fill_value IndexVariable.prod IndexVariable.quantile IndexVariable.rank diff --git a/doc/api.rst b/doc/api.rst index 8514dff8264..fe01f495e24 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -30,6 +30,7 @@ Top-level functions zeros_like ones_like dot + polyval map_blocks show_versions set_options @@ -165,6 +166,7 @@ Computation Dataset.groupby_bins Dataset.rolling Dataset.rolling_exp + Dataset.weighted Dataset.coarsen Dataset.resample Dataset.diff @@ -172,6 +174,7 @@ Computation Dataset.differentiate Dataset.integrate Dataset.map_blocks + Dataset.polyfit **Aggregation**: :py:attr:`~Dataset.all` @@ -221,6 +224,7 @@ Reshaping and reorganizing Dataset.to_stacked_array Dataset.shift Dataset.roll + Dataset.pad Dataset.sortby Dataset.broadcast_like @@ -341,6 +345,7 @@ Computation DataArray.groupby_bins DataArray.rolling DataArray.rolling_exp + DataArray.weighted DataArray.coarsen DataArray.dt DataArray.resample @@ -350,6 +355,7 @@ Computation DataArray.quantile DataArray.differentiate DataArray.integrate + DataArray.polyfit DataArray.str DataArray.map_blocks @@ -402,6 +408,7 @@ Reshaping and reorganizing DataArray.to_unstacked_dataset DataArray.shift DataArray.roll + DataArray.pad DataArray.sortby DataArray.broadcast_like @@ -578,6 +585,22 @@ Rolling objects core.rolling.DatasetRolling.reduce core.rolling_exp.RollingExp +Weighted objects +================ + +.. autosummary:: + :toctree: generated/ + + core.weighted.DataArrayWeighted + core.weighted.DataArrayWeighted.mean + core.weighted.DataArrayWeighted.sum + core.weighted.DataArrayWeighted.sum_of_weights + core.weighted.DatasetWeighted + core.weighted.DatasetWeighted.mean + core.weighted.DatasetWeighted.sum + core.weighted.DatasetWeighted.sum_of_weights + + Coarsen objects =============== diff --git a/doc/computation.rst b/doc/computation.rst index 1ac30f55ee7..4b8014c4782 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -1,3 +1,5 @@ +.. currentmodule:: xarray + .. _comput: ########### @@ -241,12 +243,94 @@ You can also use ``construct`` to compute a weighted rolling sum: To avoid this, use ``skipna=False`` as the above example. +.. _comput.weighted: + +Weighted array reductions +========================= + +:py:class:`DataArray` and :py:class:`Dataset` objects include :py:meth:`DataArray.weighted` +and :py:meth:`Dataset.weighted` array reduction methods. They currently +support weighted ``sum`` and weighted ``mean``. + +.. 
ipython:: python + + coords = dict(month=('month', [1, 2, 3])) + + prec = xr.DataArray([1.1, 1.0, 0.9], dims=('month', ), coords=coords) + weights = xr.DataArray([31, 28, 31], dims=('month', ), coords=coords) + +Create a weighted object: + +.. ipython:: python + + weighted_prec = prec.weighted(weights) + weighted_prec + +Calculate the weighted sum: + +.. ipython:: python + + weighted_prec.sum() + +Calculate the weighted mean: + +.. ipython:: python + + weighted_prec.mean(dim="month") + +The weighted sum corresponds to: + +.. ipython:: python + + weighted_sum = (prec * weights).sum() + weighted_sum + +and the weighted mean to: + +.. ipython:: python + + weighted_mean = weighted_sum / weights.sum() + weighted_mean + +However, the functions also take missing values in the data into account: + +.. ipython:: python + + data = xr.DataArray([np.NaN, 2, 4]) + weights = xr.DataArray([8, 1, 1]) + + data.weighted(weights).mean() + +Using ``(data * weights).sum() / weights.sum()`` would (incorrectly) result +in 0.6. + + +If the weights add up to to 0, ``sum`` returns 0: + +.. ipython:: python + + data = xr.DataArray([1.0, 1.0]) + weights = xr.DataArray([-1.0, 1.0]) + + data.weighted(weights).sum() + +and ``mean`` returns ``NaN``: + +.. ipython:: python + + data.weighted(weights).mean() + + +.. note:: + ``weights`` must be a :py:class:`DataArray` and cannot contain missing values. + Missing values can be replaced manually by ``weights.fillna(0)``. + .. _comput.coarsen: Coarsen large arrays ==================== -``DataArray`` and ``Dataset`` objects include a +:py:class:`DataArray` and :py:class:`Dataset` objects include a :py:meth:`~xarray.DataArray.coarsen` and :py:meth:`~xarray.Dataset.coarsen` methods. This supports the block aggregation along multiple dimensions, @@ -317,6 +401,32 @@ trapezoidal rule using their coordinates, and integration along multidimensional coordinate are not supported. +.. _compute.polyfit: + +Fitting polynomials +=================== + +Xarray objects provide an interface for performing linear or polynomial regressions +using the least-squares method. :py:meth:`~xarray.DataArray.polyfit` computes the +best fitting coefficients along a given dimension and for a given order, + +.. ipython:: python + + x = xr.DataArray(np.arange(10), dims=['x'], name='x') + a = xr.DataArray(3 + 4 * x, dims=['x'], coords={'x': x}) + out = a.polyfit(dim='x', deg=1, full=True) + out + +The method outputs a dataset containing the coefficients (and more if `full=True`). +The inverse operation is done with :py:meth:`~xarray.polyval`, + +.. ipython:: python + + xr.polyval(coord=x, coeffs=out.polyfit_coefficients) + +.. note:: + These methods replicate the behaviour of :py:func:`numpy.polyfit` and :py:func:`numpy.polyval`. + .. _compute.broadcasting: Broadcasting by dimension name diff --git a/doc/contributing.rst b/doc/contributing.rst index eb31db24591..f581bcd9741 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -51,8 +51,8 @@ Bug reports must: `_:: ```python - >>> from xarray import Dataset - >>> df = Dataset(...) + >>> import xarray as xr + >>> df = xr.Dataset(...) ... ``` @@ -378,8 +378,8 @@ and then running:: pre-commit install -from the root of the xarray repository. You can skip the pre-commit checks with -``git commit --no-verify``. +from the root of the xarray repository. You can skip the pre-commit checks +with ``git commit --no-verify``. 
Backwards Compatibility diff --git a/doc/examples.rst b/doc/examples.rst index 3067ca824be..1d48d29bcc5 100644 --- a/doc/examples.rst +++ b/doc/examples.rst @@ -6,6 +6,7 @@ Examples examples/weather-data examples/monthly-means + examples/area_weighted_temperature examples/multidimensional-coords examples/visualization_gallery examples/ROMS_ocean_model @@ -17,3 +18,12 @@ Using apply_ufunc :maxdepth: 2 examples/apply_ufunc_vectorize_1d + +External Examples +----------------- +.. toctree:: + :maxdepth: 2 + + Managing raster data with rioxarray + Xarray with dask + Xarray and dask on the cloud with Pangeo diff --git a/doc/examples/area_weighted_temperature.ipynb b/doc/examples/area_weighted_temperature.ipynb new file mode 100644 index 00000000000..72876e3fc29 --- /dev/null +++ b/doc/examples/area_weighted_temperature.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "toc": true + }, + "source": [ + "

<h1>Table of Contents<span class=\"tocSkip\"></span></h1>
\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare weighted and unweighted mean temperature\n", + "\n", + "\n", + "Author: [Mathias Hauser](https://github.com/mathause/)\n", + "\n", + "\n", + "We use the `air_temperature` example dataset to calculate the area-weighted temperature over its domain. This dataset has a regular latitude/ longitude grid, thus the gridcell area decreases towards the pole. For this grid we can use the cosine of the latitude as proxy for the grid cell area.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-17T14:43:57.222351Z", + "start_time": "2020-03-17T14:43:56.147541Z" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import cartopy.crs as ccrs\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "import xarray as xr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data\n", + "\n", + "Load the data, convert to celsius, and resample to daily values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-17T14:43:57.831734Z", + "start_time": "2020-03-17T14:43:57.651845Z" + } + }, + "outputs": [], + "source": [ + "ds = xr.tutorial.load_dataset(\"air_temperature\")\n", + "\n", + "# to celsius\n", + "air = ds.air - 273.15\n", + "\n", + "# resample from 6-hourly to daily values\n", + "air = air.resample(time=\"D\").mean()\n", + "\n", + "air" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the first timestep:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-17T14:43:59.887120Z", + "start_time": "2020-03-17T14:43:59.582894Z" + } + }, + "outputs": [], + "source": [ + "projection = ccrs.LambertConformal(central_longitude=-95, central_latitude=45)\n", + "\n", + "f, ax = plt.subplots(subplot_kw=dict(projection=projection))\n", + "\n", + "air.isel(time=0).plot(transform=ccrs.PlateCarree(), cbar_kwargs=dict(shrink=0.7))\n", + "ax.coastlines()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating weights\n", + "\n", + "For a for a rectangular grid the cosine of the latitude is proportional to the grid cell area." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-17T14:44:18.777092Z", + "start_time": "2020-03-17T14:44:18.736587Z" + } + }, + "outputs": [], + "source": [ + "weights = np.cos(np.deg2rad(air.lat))\n", + "weights.name = \"weights\"\n", + "weights" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Weighted mean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-17T14:44:52.607120Z", + "start_time": "2020-03-17T14:44:52.564674Z" + } + }, + "outputs": [], + "source": [ + "air_weighted = air.weighted(weights)\n", + "air_weighted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-17T14:44:54.334279Z", + "start_time": "2020-03-17T14:44:54.280022Z" + } + }, + "outputs": [], + "source": [ + "weighted_mean = air_weighted.mean((\"lon\", \"lat\"))\n", + "weighted_mean" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot: comparison with unweighted mean\n", + "\n", + "Note how the weighted mean temperature is higher than the unweighted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-03-17T14:45:08.877307Z", + "start_time": "2020-03-17T14:45:08.673383Z" + } + }, + "outputs": [], + "source": [ + "weighted_mean.plot(label=\"weighted\")\n", + "air.mean((\"lon\", \"lat\")).plot(label=\"unweighted\")\n", + "\n", + "plt.legend()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": true, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/doc/installing.rst b/doc/installing.rst index dfc2841a956..a25bf65e342 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -11,6 +11,8 @@ Required dependencies - `numpy `__ (1.15 or later) - `pandas `__ (0.25 or later) +.. _optional-dependencies: + Optional dependencies --------------------- @@ -24,7 +26,7 @@ For netCDF and IO - `h5netcdf `__: an alternative library for reading and writing netCDF4 files that does not use the netCDF-C libraries - `pynio `__: for reading GRIB and other - geoscience specific file formats + geoscience specific file formats. Note that pynio is not available for Windows. - `zarr `__: for chunked, compressed, N-dimensional arrays. - `cftime `__: recommended if you want to encode/decode datetimes for non-standard calendars or dates before @@ -121,16 +123,15 @@ xarray itself is a pure Python package, but its dependencies are not. The easiest way to get everything installed is to use conda_. To install xarray with its recommended dependencies using the conda command line tool:: - $ conda install xarray dask netCDF4 bottleneck + $ conda install -c conda-forge xarray dask netCDF4 bottleneck .. 
_conda: http://conda.io/ -We recommend using the community maintained `conda-forge `__ channel if you need difficult\-to\-build dependencies such as cartopy, pynio or PseudoNetCDF:: - - $ conda install -c conda-forge xarray cartopy pynio pseudonetcdf +If you require other :ref:`optional-dependencies` add them to the line above. -New releases may also appear in conda-forge before being updated in the default -channel. +We recommend using the community maintained `conda-forge `__ channel, +as some of the dependencies are difficult to build. New releases may also appear in conda-forge before +being updated in the default channel. If you don't use conda, be sure you have the required dependencies (numpy and pandas) installed first. Then, install xarray with pip:: diff --git a/doc/io.rst b/doc/io.rst index e910943236f..6064aa3568a 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -759,9 +759,53 @@ for an example of how to convert these to longitudes and latitudes. considered as being experimental. Please report any bug you may find on xarray's github repository. + +Additionally, you can use `rioxarray`_ for reading in GeoTiff, netCDF or other +GDAL readable raster data using `rasterio`_ as well as for exporting to a geoTIFF. +`rioxarray`_ can also handle geospatial related tasks such as re-projecting and clipping. + +.. ipython:: + :verbatim: + + In [1]: import rioxarray + + In [2]: rds = rioxarray.open_rasterio('RGB.byte.tif') + + In [3]: rds + Out[3]: + + [1703814 values with dtype=uint8] + Coordinates: + * band (band) int64 1 2 3 + * y (y) float64 2.827e+06 2.826e+06 ... 2.612e+06 2.612e+06 + * x (x) float64 1.021e+05 1.024e+05 ... 3.389e+05 3.392e+05 + spatial_ref int64 0 + Attributes: + STATISTICS_MAXIMUM: 255 + STATISTICS_MEAN: 29.947726688477 + STATISTICS_MINIMUM: 0 + STATISTICS_STDDEV: 52.340921626611 + transform: (300.0379266750948, 0.0, 101985.0, 0.0, -300.0417827... + _FillValue: 0.0 + scale_factor: 1.0 + add_offset: 0.0 + grid_mapping: spatial_ref + + In [4]: rds.rio.crs + Out[4]: CRS.from_epsg(32618) + + In [5]: rds4326 = rio.rio.reproject("epsg:4326") + + In [6]: rds4326.rio.crs + Out[6]: CRS.from_epsg(4326) + + In [7]: rds4326.rio.to_raster('RGB.byte.4326.tif') + + .. _rasterio: https://rasterio.readthedocs.io/en/latest/ +.. _rioxarray: https://corteva.github.io/rioxarray/stable/ .. _test files: https://github.com/mapbox/rasterio/blob/master/tests/data/RGB.byte.tif -.. _pyproj: https://github.com/jswhit/pyproj +.. _pyproj: https://github.com/pyproj4/pyproj .. _io.zarr: diff --git a/doc/pandas.rst b/doc/pandas.rst index b1660e48dd2..b0ec2a117dc 100644 --- a/doc/pandas.rst +++ b/doc/pandas.rst @@ -110,10 +110,10 @@ Multi-dimensional data Tidy data is great, but it sometimes you want to preserve dimensions instead of automatically stacking them into a ``MultiIndex``. -:py:meth:`DataArray.to_pandas()` is a shortcut that -lets you convert a DataArray directly into a pandas object with the same -dimensionality (i.e., a 1D array is converted to a :py:class:`~pandas.Series`, -2D to :py:class:`~pandas.DataFrame` and 3D to ``pandas.Panel``): +:py:meth:`DataArray.to_pandas()` is a shortcut that lets you convert a +DataArray directly into a pandas object with the same dimensionality, if +available in pandas (i.e., a 1D array is converted to a +:py:class:`~pandas.Series` and 2D to :py:class:`~pandas.DataFrame`): .. 
ipython:: python @@ -151,13 +151,13 @@ However, you will need to set dimension names explicitly, either with the Transitioning from pandas.Panel to xarray ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``Panel``, pandas' data structure for 3D arrays, has always -been a second class data structure compared to the Series and DataFrame. To -allow pandas developers to focus more on its core functionality built around -the DataFrame, pandas has deprecated ``Panel``. It will be removed in pandas -0.25. +``Panel``, pandas' data structure for 3D arrays, was always a second class +data structure compared to the Series and DataFrame. To allow pandas +developers to focus more on its core functionality built around the +DataFrame, pandas removed ``Panel`` in favor of directing users who use +multi-dimensional arrays to xarray. -xarray has most of ``Panel``'s features, a more explicit API (particularly around +Xarray has most of ``Panel``'s features, a more explicit API (particularly around indexing), and the ability to scale to >3 dimensions with the same interface. As discussed :ref:`elsewhere ` in the docs, there are two primary data structures in @@ -210,7 +210,7 @@ You can also easily convert this data into ``Dataset``: array.to_dataset(dim='dim_0') Here, there are two data variables, each representing a DataFrame on panel's -``items`` axis, and labelled as such. Each variable is a 2D array of the +``items`` axis, and labeled as such. Each variable is a 2D array of the respective values along the ``items`` dimension. While the xarray docs are relatively complete, a few items stand out for Panel users: diff --git a/doc/plotting.rst b/doc/plotting.rst index ea9816780a7..f3d9c0213de 100644 --- a/doc/plotting.rst +++ b/doc/plotting.rst @@ -657,7 +657,7 @@ Additionally, the boolean kwarg ``add_guide`` can be used to prevent the display .. ipython:: python - ds.w.values = [1, 2, 3, 5] + ds = ds.assign(w=[1, 2, 3, 5]) @savefig ds_discrete_legend_hue_scatter.png ds.plot.scatter(x='A', y='B', hue='w', hue_style='discrete') diff --git a/doc/related-projects.rst b/doc/related-projects.rst index 3188751366f..57b8da0c447 100644 --- a/doc/related-projects.rst +++ b/doc/related-projects.rst @@ -61,7 +61,9 @@ Extend xarray capabilities - `Collocate `_: Collocate xarray trajectories in arbitrary physical dimensions - `eofs `_: EOF analysis in Python. - `hypothesis-gufunc `_: Extension to hypothesis. Makes it easy to write unit tests with xarray objects as input. +- `nxarray `_: NeXus input/output capability for xarray. - `xarray_extras `_: Advanced algorithms for xarray objects (e.g. integrations/interpolations). +- `xpublish `_: Publish Xarray Datasets via a Zarr compatible REST API. - `xrft `_: Fourier transforms for xarray data. - `xr-scipy `_: A lightweight scipy wrapper for xarray. - `X-regression `_: Multiple linear regression from Statsmodels library coupled with Xarray library. diff --git a/doc/reshaping.rst b/doc/reshaping.rst index 455a24f9216..465ca14dfc2 100644 --- a/doc/reshaping.rst +++ b/doc/reshaping.rst @@ -109,6 +109,13 @@ implemented :py:meth:`~xarray.DataArray.stack` and stacked stacked.unstack('z') +As elsewhere in xarray, an ellipsis (`...`) can be used to represent all unlisted dimensions: + +.. ipython:: python + + stacked = array.stack(z=[..., "x"]) + stacked + These methods are modeled on the :py:class:`pandas.DataFrame` methods of the same name, although in xarray they always create new dimensions rather than adding to the existing index or columns. 
@@ -164,6 +171,7 @@ like this: 'b': ('x', [6, 7])}, coords={'y': ['u', 'v', 'w']} ) + data stacked = data.to_stacked_array("z", sample_dims=['x']) stacked unstacked = stacked.to_unstacked_dataset("z") diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst index 96641c2b97e..9e7c0f1d51d 100644 --- a/doc/weather-climate.rst +++ b/doc/weather-climate.rst @@ -105,6 +105,14 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: da.time.dt.dayofyear da.time.dt.dayofweek +- Rounding of datetimes to fixed frequencies via the ``dt`` accessor: + +.. ipython:: python + + da.time.dt.ceil('3D') + da.time.dt.floor('5D') + da.time.dt.round('2D') + - Group-by operations based on datetime accessor attributes (e.g. by month of the year): diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1deb77eecfc..a138dee4128 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,32 +13,120 @@ What's New import xarray as xr np.random.seed(123456) -.. _whats-new.0.15.1: +.. _whats-new.0.16.0: + +v0.16.0 (unreleased) +--------------------- + +Breaking changes +~~~~~~~~~~~~~~~~ +- Alternate draw styles for :py:meth:`plot.step` must be passed using the + ``drawstyle`` (or ``ds``) keyword argument, instead of the ``linestyle`` (or + ``ls``) keyword argument, in line with the `upstream change in Matplotlib + `_. + (:pull:`3274`) + By `Elliott Sales de Andrade `_ + +New Features +~~~~~~~~~~~~ +- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`) + By `Pascal Bourgault `_. +- Control over attributes of result in :py:func:`merge`, :py:func:`concat`, + :py:func:`combine_by_coords` and :py:func:`combine_nested` using + combine_attrs keyword argument. (:issue:`3865`, :pull:`3877`) + By `John Omotani `_ +- Limited the length of array items with long string reprs to a + reasonable width (:pull:`3900`) + By `Maximilian Roos `_ + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ +- Use ``Fixes`` rather than ``Closes`` in GitHub Pull Request template, allowing + linking to issues. + By `Maximilian Roos `_ -v0.15.1 (unreleased) + +.. _whats-new.0.15.1: + +v0.15.1 (23 Mar 2020) --------------------- +This release brings many new features such as :py:meth:`Dataset.weighted` methods for weighted array +reductions, a new jupyter repr by default, and the start of units integration with pint. There's also +the usual batch of usability improvements, documentation additions, and bug fixes. + Breaking changes ~~~~~~~~~~~~~~~~ +- Raise an error when assigning to the ``.values`` or ``.data`` attribute of + dimension coordinates i.e. ``IndexVariable`` objects. This has been broken since + v0.12.0. Please use :py:meth:`DataArray.assign_coords` or :py:meth:`Dataset.assign_coords` + instead. (:issue:`3470`, :pull:`3862`) + By `Deepak Cherian `_ + New Features ~~~~~~~~~~~~ +- Weighted array reductions are now supported via the new :py:meth:`DataArray.weighted` + and :py:meth:`Dataset.weighted` methods. See :ref:`comput.weighted`. (:issue:`422`, :pull:`2922`). + By `Mathias Hauser `_ +- The new jupyter notebook repr (``Dataset._repr_html_`` and + ``DataArray._repr_html_``) (introduced in 0.14.1) is now on by default. To + disable, use ``xarray.set_options(display_style="text")``. + By `Julia Signell `_. +- Added support for :py:class:`pandas.DatetimeIndex`-style rounding of + ``cftime.datetime`` objects directly via a :py:class:`CFTimeIndex` or via the + :py:class:`~core.accessor_dt.DatetimeAccessor`. 
+ By `Spencer Clark `_ - Support new h5netcdf backend keyword `phony_dims` (available from h5netcdf v0.8.0 for :py:class:`~xarray.backends.H5NetCDFStore`. By `Kai Mühlbauer `_. -- implement pint support. (:issue:`3594`, :pull:`3706`) +- Add partial support for unit aware arrays with pint. (:pull:`3706`, :pull:`3611`) By `Justus Magin `_. - :py:meth:`Dataset.groupby` and :py:meth:`DataArray.groupby` now raise a `TypeError` on multiple string arguments. Receiving multiple string arguments - often means a user is attempting to pass multiple dimensions to group over - and should instead pass a list. + often means a user is attempting to pass multiple dimensions as separate + arguments and should instead pass a single list of dimensions. + (:pull:`3802`) + By `Maximilian Roos `_ +- :py:func:`map_blocks` can now apply functions that add new unindexed dimensions. + By `Deepak Cherian `_ +- An ellipsis (``...``) is now supported in the ``dims`` argument of + :py:meth:`Dataset.stack` and :py:meth:`DataArray.stack`, meaning all + unlisted dimensions, similar to its meaning in :py:meth:`DataArray.transpose`. + (:pull:`3826`) By `Maximilian Roos `_ +- :py:meth:`Dataset.where` and :py:meth:`DataArray.where` accept a lambda as a + first argument, which is then called on the input; replicating pandas' behavior. + By `Maximilian Roos `_. +- ``skipna`` is available in :py:meth:`Dataset.quantile`, :py:meth:`DataArray.quantile`, + :py:meth:`core.groupby.DatasetGroupBy.quantile`, :py:meth:`core.groupby.DataArrayGroupBy.quantile` + (:issue:`3843`, :pull:`3844`) + By `Aaron Spring `_. Bug fixes ~~~~~~~~~ +- Fix :py:meth:`Dataset.interp` when indexing array shares coordinates with the + indexed variable (:issue:`3252`). + By `David Huard `_. +- Fix recombination of groups in :py:meth:`Dataset.groupby` and + :py:meth:`DataArray.groupby` when performing an operation that changes the + size of the groups along the grouped dimension. By `Eric Jansen + `_. +- Fix use of multi-index with categorical values (:issue:`3674`). + By `Matthieu Ancellin `_. +- Fix alignment with ``join="override"`` when some dimensions are unindexed. (:issue:`3681`). + By `Deepak Cherian `_. - Fix :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims` producing index with name reflecting the previous dimension name instead of the new one (:issue:`3748`, :pull:`3752`). By `Joseph K Aicher @@ -52,21 +140,53 @@ Bug fixes - xarray now respects the over, under and bad colors if set on a provided colormap. (:issue:`3590`, :pull:`3601`) By `johnomotani `_. +- :py:func:`coarsen` now respects ``xr.set_options(keep_attrs=True)`` + to preserve attributes. :py:meth:`Dataset.coarsen` accepts a keyword + argument ``keep_attrs`` to change this setting. (:issue:`3376`, + :pull:`3801`) By `Andrew Thomas `_. +- Delete associated indexes when deleting coordinate variables. (:issue:`3746`). + By `Deepak Cherian `_. +- Fix :py:meth:`xarray.core.dataset.Dataset.to_zarr` when using `append_dim` and `group` + simultaneously. (:issue:`3170`). By `Matthias Meyer `_. +- Fix html repr on :py:class:`Dataset` with non-string keys (:pull:`3807`). + By `Maximilian Roos `_. Documentation ~~~~~~~~~~~~~ +- Fix documentation of :py:class:`DataArray` removing the deprecated mention + that when omitted, `dims` are inferred from a `coords`-dict. (:pull:`3821`) + By `Sander van Rijn `_. +- Improve the :py:func:`where` docstring. + By `Maximilian Roos `_ +- Update the installation instructions: only explicitly list recommended dependencies + (:issue:`3756`). 
+ By `Mathias Hauser `_. + Internal Changes ~~~~~~~~~~~~~~~~ -- Removed the internal ``import_seaborn`` function which handled the deprecation of +- Remove the internal ``import_seaborn`` function which handled the deprecation of the ``seaborn.apionly`` entry point (:issue:`3747`). By `Mathias Hauser `_. - Don't test pint integration in combination with datetime objects. (:issue:`3778`, :pull:`3788`) By `Justus Magin `_. -- Changed test_open_mfdataset_list_attr to only run with dask installed +- Change test_open_mfdataset_list_attr to only run with dask installed (:issue:`3777`, :pull:`3780`). By `Bruno Pagani `_. +- Preserve the ability to index with ``method="nearest"`` with a + :py:class:`CFTimeIndex` with pandas versions greater than 1.0.1 + (:issue:`3751`). By `Spencer Clark `_. +- Greater flexibility and improved test coverage of subtracting various types + of objects from a :py:class:`CFTimeIndex`. By `Spencer Clark + `_. +- Update Azure CI MacOS image, given pending removal. + By `Maximilian Roos `_ +- Remove xfails for scipy 1.0.1 for tests that append to netCDF files (:pull:`3805`). + By `Mathias Hauser `_. +- Remove conversion to :py:class:`pandas.Panel`, given its removal in pandas + in favor of xarray's objects. + By `Maximilian Roos `_ .. _whats-new.0.15.0: @@ -102,6 +222,8 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Implement :py:meth:`DataArray.pad` and :py:meth:`Dataset.pad`. (:issue:`2605`, :pull:`3596`). + By `Mark Boer `_. - :py:meth:`DataArray.sel` and :py:meth:`Dataset.sel` now support :py:class:`pandas.CategoricalIndex`. (:issue:`3669`) By `Keisuke Fujii `_. - Support using an existing, opened h5netcdf ``File`` with @@ -677,12 +799,13 @@ Bug fixes - Plots in 2 dimensions (pcolormesh, contour) now allow to specify levels as numpy array (:issue:`3284`). By `Mathias Hauser `_. - Fixed bug in :meth:`DataArray.quantile` failing to keep attributes when - `keep_attrs` was True (:issue:`3304`). By David Huard ``_. + `keep_attrs` was True (:issue:`3304`). By `David Huard `_. Documentation ~~~~~~~~~~~~~ -- Created a `PR checklist `_ as a quick reference for tasks before creating a new PR +- Created a `PR checklist `_ + as a quick reference for tasks before creating a new PR or pushing new commits. By `Gregory Gundersen `_. 
diff --git a/readthedocs.yml b/readthedocs.yml index ad249bf8c09..173d61ec6f3 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,13 +1,9 @@ version: 2 build: - image: latest + image: stable conda: environment: ci/requirements/doc.yml -python: - version: 3.8 - install: [] - formats: [] diff --git a/xarray/__init__.py b/xarray/__init__.py index 331d8ecb09a..0fead57e5fb 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -17,7 +17,7 @@ from .core.alignment import align, broadcast from .core.combine import auto_combine, combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, dot, where +from .core.computation import apply_ufunc, dot, polyval, where from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset @@ -65,6 +65,7 @@ "open_mfdataset", "open_rasterio", "open_zarr", + "polyval", "register_dataarray_accessor", "register_dataset_accessor", "save_mfdataset", diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 56cd0649989..c7481e22b59 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1196,8 +1196,8 @@ def save_mfdataset( Save a dataset into one netCDF per year of data: - >>> years, datasets = zip(*ds.groupby('time.year')) - >>> paths = ['%s.nc' % y for y in years] + >>> years, datasets = zip(*ds.groupby("time.year")) + >>> paths = ["%s.nc" % y for y in years] >>> xr.save_mfdataset(datasets, paths) """ if mode == "w" and len(set(paths)) < len(paths): @@ -1253,7 +1253,7 @@ def check_dtype(var): if ( not np.issubdtype(var.dtype, np.number) and not np.issubdtype(var.dtype, np.datetime64) - and not np.issubdtype(var.dtype, np.bool) + and not np.issubdtype(var.dtype, np.bool_) and not coding.strings.is_unicode_dtype(var.dtype) and not var.dtype == object ): diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 763769dac74..cdc74e06882 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -10,13 +10,20 @@ from .common import AbstractWritableDataStore, BackendArray, _encode_variable_name # need some special secret attributes to tell us the dimensions -_DIMENSION_KEY = "_ARRAY_DIMENSIONS" +DIMENSION_KEY = "_ARRAY_DIMENSIONS" -# zarr attributes have to be serializable as json -# many xarray datasets / variables have numpy arrays and values -# these functions handle encoding / decoding of such items -def _encode_zarr_attr_value(value): +def encode_zarr_attr_value(value): + """ + Encode a attribute value as something that can be serialized as json + + Many xarray datasets / variables have numpy arrays and values. This + function handles encoding / decoding of such items. 
+ + ndarray -> list + scalar array -> scalar + other -> other (no change) + """ if isinstance(value, np.ndarray): encoded = value.tolist() # this checks if it's a scalar number @@ -170,7 +177,20 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): return dimensions, attributes -def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): +def extract_zarr_variable_encoding(variable, raise_on_invalid=False): + """ + Extract zarr encoding dictionary from xarray Variable + + Parameters + ---------- + variable : xarray.Variable + raise_on_invalid : bool, optional + + Returns + ------- + encoding : dict + Zarr encoding for `variable` + """ encoding = variable.encoding.copy() valid_encodings = {"chunks", "compressor", "filters", "cache_metadata"} @@ -271,7 +291,7 @@ def __init__(self, zarr_group, consolidate_on_close=False): def open_store_variable(self, name, zarr_array): data = indexing.LazilyOuterIndexedArray(ZarrArrayWrapper(name, self)) - dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, _DIMENSION_KEY) + dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, DIMENSION_KEY) attributes = dict(attributes) encoding = { "chunks": zarr_array.chunks, @@ -298,7 +318,7 @@ def get_dimensions(self): dimensions = {} for k, v in self.ds.arrays(): try: - for d, s in zip(v.attrs[_DIMENSION_KEY], v.shape): + for d, s in zip(v.attrs[DIMENSION_KEY], v.shape): if d in dimensions and dimensions[d] != s: raise ValueError( "found conflicting lengths for dimension %s " @@ -310,7 +330,7 @@ def get_dimensions(self): raise KeyError( "Zarr object is missing the attribute `%s`, " "which is required for xarray to determine " - "variable dimensions." % (_DIMENSION_KEY) + "variable dimensions." % (DIMENSION_KEY) ) return dimensions @@ -328,7 +348,7 @@ def encode_variable(self, variable): return variable def encode_attribute(self, a): - return _encode_zarr_attr_value(a) + return encode_zarr_attr_value(a) def store( self, @@ -373,7 +393,7 @@ def store( if len(existing_variables) > 0: # there are variables to append # their encoding must be the same as in the store - ds = open_zarr(self.ds.store, chunks=None) + ds = open_zarr(self.ds.store, group=self.ds.path, chunks=None) variables_with_encoding = {} for vn in existing_variables: variables_with_encoding[vn] = variables[vn].copy(deep=False) @@ -433,10 +453,10 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No writer.add(v.data, zarr_array, region=tuple(new_region)) else: # new variable - encoding = _extract_zarr_variable_encoding(v, raise_on_invalid=check) + encoding = extract_zarr_variable_encoding(v, raise_on_invalid=check) encoded_attrs = {} # the magic for storing the hidden dimension data - encoded_attrs[_DIMENSION_KEY] = dims + encoded_attrs[DIMENSION_KEY] = dims for k2, v2 in attrs.items(): encoded_attrs[k2] = self.encode_attribute(v2) @@ -487,7 +507,7 @@ def open_zarr( directory in file system where a Zarr DirectoryStore has been stored. synchronizer : object, optional Array synchronizer provided to zarr - group : str, obtional + group : str, optional Group path. (a.k.a. `path` in zarr terminology.) 
chunks : int or dict or tuple or {None, 'auto'}, optional Chunk sizes along each dimension, e.g., ``5`` or diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index eeb68508527..a2306331ca7 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -938,7 +938,7 @@ def cftime_range( This function returns a ``CFTimeIndex``, populated with ``cftime.datetime`` objects associated with the specified calendar type, e.g. - >>> xr.cftime_range(start='2000', periods=6, freq='2MS', calendar='noleap') + >>> xr.cftime_range(start="2000", periods=6, freq="2MS", calendar="noleap") CFTimeIndex([2000-01-01 00:00:00, 2000-03-01 00:00:00, 2000-05-01 00:00:00, 2000-07-01 00:00:00, 2000-09-01 00:00:00, 2000-11-01 00:00:00], dtype='object') diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 8b440812ca9..2e42702caac 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -49,6 +49,7 @@ from xarray.core.utils import is_scalar +from ..core.common import _contains_cftime_datetimes from .times import _STANDARD_CALENDARS, cftime_to_nptime, infer_calendar_name @@ -252,6 +253,7 @@ def __new__(cls, data, name=None): result = object.__new__(cls) result._data = np.array(data, dtype="O") result.name = name + result._cache = {} return result def _partial_date_slice(self, resolution, parsed): @@ -268,29 +270,32 @@ def _partial_date_slice(self, resolution, parsed): >>> from cftime import DatetimeNoLeap >>> import pandas as pd >>> import xarray as xr - >>> da = xr.DataArray([1, 2], - coords=[[DatetimeNoLeap(2001, 1, 1), - DatetimeNoLeap(2001, 2, 1)]], - dims=['time']) - >>> da.sel(time='2001-01-01') + >>> da = xr.DataArray( + ... [1, 2], + ... coords=[[DatetimeNoLeap(2001, 1, 1), DatetimeNoLeap(2001, 2, 1)]], + ... dims=["time"], + ... ) + >>> da.sel(time="2001-01-01") array([1]) Coordinates: * time (time) object 2001-01-01 00:00:00 - >>> da = xr.DataArray([1, 2], - coords=[[pd.Timestamp(2001, 1, 1), - pd.Timestamp(2001, 2, 1)]], - dims=['time']) - >>> da.sel(time='2001-01-01') + >>> da = xr.DataArray( + ... [1, 2], + ... coords=[[pd.Timestamp(2001, 1, 1), pd.Timestamp(2001, 2, 1)]], + ... dims=["time"], + ... ) + >>> da.sel(time="2001-01-01") array(1) Coordinates: time datetime64[ns] 2001-01-01 - >>> da = xr.DataArray([1, 2], - coords=[[pd.Timestamp(2001, 1, 1, 1), - pd.Timestamp(2001, 2, 1)]], - dims=['time']) - >>> da.sel(time='2001-01-01') + >>> da = xr.DataArray( + ... [1, 2], + ... coords=[[pd.Timestamp(2001, 1, 1, 1), pd.Timestamp(2001, 2, 1)]], + ... dims=["time"], + ... 
) + >>> da.sel(time="2001-01-01") array([1]) Coordinates: @@ -326,6 +331,32 @@ def _get_string_slice(self, key): raise KeyError(key) return loc + def _get_nearest_indexer(self, target, limit, tolerance): + """Adapted from pandas.Index._get_nearest_indexer""" + left_indexer = self.get_indexer(target, "pad", limit=limit) + right_indexer = self.get_indexer(target, "backfill", limit=limit) + left_distances = abs(self.values[left_indexer] - target.values) + right_distances = abs(self.values[right_indexer] - target.values) + + if self.is_monotonic_increasing: + condition = (left_distances < right_distances) | (right_indexer == -1) + else: + condition = (left_distances <= right_distances) | (right_indexer == -1) + indexer = np.where(condition, left_indexer, right_indexer) + + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target, indexer, tolerance) + return indexer + + def _filter_indexer_tolerance(self, target, indexer, tolerance): + """Adapted from pandas.Index._filter_indexer_tolerance""" + if isinstance(target, pd.Index): + distance = abs(self.values[indexer] - target.values) + else: + distance = abs(self.values[indexer] - target) + indexer = np.where(distance <= tolerance, indexer, -1) + return indexer + def get_loc(self, key, method=None, tolerance=None): """Adapted from pandas.tseries.index.DatetimeIndex.get_loc""" if isinstance(key, str): @@ -396,10 +427,10 @@ def shift(self, n, freq): Examples -------- - >>> index = xr.cftime_range('2000', periods=1, freq='M') + >>> index = xr.cftime_range("2000", periods=1, freq="M") >>> index CFTimeIndex([2000-01-31 00:00:00], dtype='object') - >>> index.shift(1, 'M') + >>> index.shift(1, "M") CFTimeIndex([2000-02-29 00:00:00], dtype='object') """ from .cftime_offsets import to_offset @@ -427,9 +458,11 @@ def __radd__(self, other): return CFTimeIndex(other + np.array(self)) def __sub__(self, other): - import cftime - - if isinstance(other, (CFTimeIndex, cftime.datetime)): + if _contains_datetime_timedeltas(other): + return CFTimeIndex(np.array(self) - other) + elif isinstance(other, pd.TimedeltaIndex): + return CFTimeIndex(np.array(self) - other.to_pytimedelta()) + elif _contains_cftime_datetimes(np.array(other)): try: return pd.TimedeltaIndex(np.array(self) - np.array(other)) except OverflowError: @@ -437,14 +470,17 @@ def __sub__(self, other): "The time difference exceeds the range of values " "that can be expressed at the nanosecond resolution." ) - - elif isinstance(other, pd.TimedeltaIndex): - return CFTimeIndex(np.array(self) - other.to_pytimedelta()) else: - return CFTimeIndex(np.array(self) - other) + return NotImplemented def __rsub__(self, other): - return pd.TimedeltaIndex(other - np.array(self)) + try: + return pd.TimedeltaIndex(other - np.array(self)) + except OverflowError: + raise ValueError( + "The time difference exceeds the range of values " + "that can be expressed at the nanosecond resolution." + ) def to_datetimeindex(self, unsafe=False): """If possible, convert this index to a pandas.DatetimeIndex. @@ -479,7 +515,7 @@ def to_datetimeindex(self, unsafe=False): Examples -------- >>> import xarray as xr - >>> times = xr.cftime_range('2000', periods=2, calendar='gregorian') + >>> times = xr.cftime_range("2000", periods=2, calendar="gregorian") >>> times CFTimeIndex([2000-01-01 00:00:00, 2000-01-02 00:00:00], dtype='object') >>> times.to_datetimeindex() @@ -518,9 +554,10 @@ def strftime(self, date_format): Examples -------- - >>> rng = xr.cftime_range(start='2000', periods=5, freq='2MS', - ... 
calendar='noleap') - >>> rng.strftime('%B %d, %Y, %r') + >>> rng = xr.cftime_range( + ... start="2000", periods=5, freq="2MS", calendar="noleap" + ... ) + >>> rng.strftime("%B %d, %Y, %r") Index(['January 01, 2000, 12:00:00 AM', 'March 01, 2000, 12:00:00 AM', 'May 01, 2000, 12:00:00 AM', 'July 01, 2000, 12:00:00 AM', 'September 01, 2000, 12:00:00 AM'], @@ -528,6 +565,83 @@ def strftime(self, date_format): """ return pd.Index([date.strftime(date_format) for date in self._data]) + @property + def asi8(self): + """Convert to integers with units of microseconds since 1970-01-01.""" + from ..core.resample_cftime import exact_cftime_datetime_difference + + epoch = self.date_type(1970, 1, 1) + return np.array( + [ + _total_microseconds(exact_cftime_datetime_difference(epoch, date)) + for date in self.values + ] + ) + + def _round_via_method(self, freq, method): + """Round dates using a specified method.""" + from .cftime_offsets import CFTIME_TICKS, to_offset + + offset = to_offset(freq) + if not isinstance(offset, CFTIME_TICKS): + raise ValueError(f"{offset} is a non-fixed frequency") + + unit = _total_microseconds(offset.as_timedelta()) + values = self.asi8 + rounded = method(values, unit) + return _cftimeindex_from_i8(rounded, self.date_type, self.name) + + def floor(self, freq): + """Round dates down to fixed frequency. + + Parameters + ---------- + freq : str or CFTimeOffset + The frequency level to round the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See `frequency + aliases `_ + for a list of possible values. + + Returns + ------- + CFTimeIndex + """ + return self._round_via_method(freq, _floor_int) + + def ceil(self, freq): + """Round dates up to fixed frequency. + + Parameters + ---------- + freq : str or CFTimeOffset + The frequency level to round the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See `frequency + aliases `_ + for a list of possible values. + + Returns + ------- + CFTimeIndex + """ + return self._round_via_method(freq, _ceil_int) + + def round(self, freq): + """Round dates to a fixed frequency. + + Parameters + ---------- + freq : str or CFTimeOffset + The frequency level to round the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See `frequency + aliases `_ + for a list of possible values. + + Returns + ------- + CFTimeIndex + """ + return self._round_via_method(freq, _round_to_nearest_half_even) + def _parse_iso8601_without_reso(date_type, datetime_str): date, _ = _parse_iso8601_with_reso(date_type, datetime_str) @@ -554,3 +668,67 @@ def _parse_array_of_cftime_strings(strings, date_type): return np.array( [_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()] ).reshape(strings.shape) + + +def _contains_datetime_timedeltas(array): + """Check if an input array contains datetime.timedelta objects.""" + array = np.atleast_1d(array) + return isinstance(array[0], timedelta) + + +def _cftimeindex_from_i8(values, date_type, name): + """Construct a CFTimeIndex from an array of integers. + + Parameters + ---------- + values : np.array + Integers representing microseconds since 1970-01-01. + date_type : cftime.datetime + Type of date for the index. + name : str + Name of the index. 
+ + Returns + ------- + CFTimeIndex + """ + epoch = date_type(1970, 1, 1) + dates = np.array([epoch + timedelta(microseconds=int(value)) for value in values]) + return CFTimeIndex(dates, name=name) + + +def _total_microseconds(delta): + """Compute the total number of microseconds of a datetime.timedelta. + + Parameters + ---------- + delta : datetime.timedelta + Input timedelta. + + Returns + ------- + int + """ + return delta / timedelta(microseconds=1) + + +def _floor_int(values, unit): + """Copied from pandas.""" + return values - np.remainder(values, unit) + + +def _ceil_int(values, unit): + """Copied from pandas.""" + return values + np.remainder(-values, unit) + + +def _round_to_nearest_half_even(values, unit): + """Copied from pandas.""" + if unit % 2: + return _ceil_int(values - unit // 2, unit) + quotient, remainder = np.divmod(values, unit) + mask = np.logical_or( + remainder > (unit // 2), np.logical_and(remainder == (unit // 2), quotient % 2) + ) + quotient[mask] += 1 + return quotient * unit diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index 6d383fcf318..35cc190ffe3 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -201,7 +201,7 @@ class StackedBytesArray(indexing.ExplicitlyIndexedNDArrayMixin): """Wrapper around array-like objects to create a new indexable object where values, when accessed, are automatically stacked along the last dimension. - >>> StackedBytesArray(np.array(['a', 'b', 'c']))[:] + >>> StackedBytesArray(np.array(["a", "b", "c"]))[:] array('abc', dtype='|S3') """ diff --git a/xarray/conventions.py b/xarray/conventions.py index a8b9906c153..df24d0d3d8d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -19,7 +19,7 @@ class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): big endian) into native endianness, so they can be used with Cython functions, such as those found in bottleneck and pandas. - >>> x = np.arange(5, dtype='>i2') + >>> x = np.arange(5, dtype=">i2") >>> x.dtype dtype('>i2') @@ -50,7 +50,7 @@ class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): This is useful for decoding boolean arrays from integer typed netCDF variables. - >>> x = np.array([1, 0, 1, 1, 0], dtype='i1') + >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") >>> x.dtype dtype('>i2') diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index c407371f9f0..2977596036c 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -78,20 +78,27 @@ def _get_date_field(values, name, dtype): return access_method(values, name) -def _round_series(values, name, freq): - """Coerce an array of datetime-like values to a pandas Series and - apply requested rounding +def _round_through_series_or_index(values, name, freq): + """Coerce an array of datetime-like values to a pandas Series or xarray + CFTimeIndex and apply requested rounding """ - values_as_series = pd.Series(values.ravel()) - method = getattr(values_as_series.dt, name) + from ..coding.cftimeindex import CFTimeIndex + + if is_np_datetime_like(values.dtype): + values_as_series = pd.Series(values.ravel()) + method = getattr(values_as_series.dt, name) + else: + values_as_cftimeindex = CFTimeIndex(values.ravel()) + method = getattr(values_as_cftimeindex, name) + field_values = method(freq=freq).values return field_values.reshape(values.shape) def _round_field(values, name, freq): - """Indirectly access pandas rounding functions by wrapping data - as a Series and calling through `.dt` attribute. 
+ """Indirectly access rounding functions by wrapping data + as a Series or CFTimeIndex Parameters ---------- @@ -110,9 +117,12 @@ def _round_field(values, name, freq): if isinstance(values, dask_array_type): from dask.array import map_blocks - return map_blocks(_round_series, values, name, freq=freq, dtype=np.datetime64) + dtype = np.datetime64 if is_np_datetime_like(values.dtype) else np.dtype("O") + return map_blocks( + _round_through_series_or_index, values, name, freq=freq, dtype=dtype + ) else: - return _round_series(values, name, freq) + return _round_through_series_or_index(values, name, freq) def _strftime_through_cftimeindex(values, date_format): @@ -240,8 +250,8 @@ class DatetimeAccessor(Properties): --------- >>> import xarray as xr >>> import pandas as pd - >>> dates = pd.date_range(start='2000/01/01', freq='D', periods=10) - >>> ts = xr.DataArray(dates, dims=('time')) + >>> dates = pd.date_range(start="2000/01/01", freq="D", periods=10) + >>> ts = xr.DataArray(dates, dims=("time")) >>> ts array(['2000-01-01T00:00:00.000000000', '2000-01-02T00:00:00.000000000', @@ -286,8 +296,8 @@ def strftime(self, date_format): Examples -------- - >>> rng = xr.Dataset({'time': datetime.datetime(2000, 1, 1)}) - >>> rng['time'].dt.strftime('%B %d, %Y, %r') + >>> rng = xr.Dataset({"time": datetime.datetime(2000, 1, 1)}) + >>> rng["time"].dt.strftime("%B %d, %Y, %r") array('January 01, 2000, 12:00:00 AM', dtype=object) """ @@ -390,7 +400,7 @@ class TimedeltaAccessor(Properties): >>> import pandas as pd >>> import xarray as xr >>> dates = pd.timedelta_range(start="1 day", freq="6H", periods=20) - >>> ts = xr.DataArray(dates, dims=('time')) + >>> ts = xr.DataArray(dates, dims=("time")) >>> ts array([ 86400000000000, 108000000000000, 129600000000000, 151200000000000, diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index 6a975b948eb..5502ba72855 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -67,7 +67,7 @@ class StringAccessor: Similar to pandas, fields can be accessed through the `.str` attribute for applicable DataArrays. - >>> da = xr.DataArray(['some', 'text', 'in', 'an', 'array']) + >>> da = xr.DataArray(["some", "text", "in", "an", "array"]) >>> ds.str.len() array([4, 4, 2, 2, 5]) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 908119f7995..abc180e049c 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -50,7 +50,7 @@ def _override_indexes(objects, all_indexes, exclude): objects = list(objects) for idx, obj in enumerate(objects[1:]): new_indexes = {} - for dim in obj.dims: + for dim in obj.indexes: if dim not in exclude: new_indexes[dim] = all_indexes[dim][0] objects[idx + 1] = obj._overwrite_indexes(new_indexes) @@ -121,10 +121,16 @@ def align( -------- >>> import xarray as xr - >>> x = xr.DataArray([[25, 35], [10, 24]], dims=('lat', 'lon'), - ... coords={'lat': [35., 40.], 'lon': [100., 120.]}) - >>> y = xr.DataArray([[20, 5], [7, 13]], dims=('lat', 'lon'), - ... coords={'lat': [35., 42.], 'lon': [100., 120.]}) + >>> x = xr.DataArray( + ... [[25, 35], [10, 24]], + ... dims=("lat", "lon"), + ... coords={"lat": [35.0, 40.0], "lon": [100.0, 120.0]}, + ... ) + >>> y = xr.DataArray( + ... [[20, 5], [7, 13]], + ... dims=("lat", "lon"), + ... coords={"lat": [35.0, 42.0], "lon": [100.0, 120.0]}, + ... 
) >>> x @@ -156,7 +162,7 @@ def align( * lat (lat) float64 35.0 * lon (lon) float64 100.0 120.0 - >>> a, b = xr.align(x, y, join='outer') + >>> a, b = xr.align(x, y, join="outer") >>> a array([[25., 35.], @@ -174,7 +180,7 @@ def align( * lat (lat) float64 35.0 40.0 42.0 * lon (lon) float64 100.0 120.0 - >>> a, b = xr.align(x, y, join='outer', fill_value=-999) + >>> a, b = xr.align(x, y, join="outer", fill_value=-999) >>> a array([[ 25, 35], @@ -192,7 +198,7 @@ def align( * lat (lat) float64 35.0 40.0 42.0 * lon (lon) float64 100.0 120.0 - >>> a, b = xr.align(x, y, join='left') + >>> a, b = xr.align(x, y, join="left") >>> a array([[25, 35], @@ -208,7 +214,7 @@ def align( * lat (lat) float64 35.0 40.0 * lon (lon) float64 100.0 120.0 - >>> a, b = xr.align(x, y, join='right') + >>> a, b = xr.align(x, y, join="right") >>> a array([[25., 35.], @@ -224,13 +230,13 @@ def align( * lat (lat) float64 35.0 42.0 * lon (lon) float64 100.0 120.0 - >>> a, b = xr.align(x, y, join='exact') + >>> a, b = xr.align(x, y, join="exact") Traceback (most recent call last): ... "indexes along dimension {!r} are not equal".format(dim) ValueError: indexes along dimension 'lat' are not equal - >>> a, b = xr.align(x, y, join='override') + >>> a, b = xr.align(x, y, join="override") >>> a array([[25, 35], @@ -674,8 +680,8 @@ def broadcast(*args, exclude=None): Broadcast two data arrays against one another to fill out their dimensions: - >>> a = xr.DataArray([1, 2, 3], dims='x') - >>> b = xr.DataArray([5, 6], dims='y') + >>> a = xr.DataArray([1, 2, 3], dims="x") + >>> b = xr.DataArray([5, 6], dims="y") >>> a array([1, 2, 3]) @@ -706,8 +712,8 @@ def broadcast(*args, exclude=None): Fill out the dimensions of all data variables in a dataset: - >>> ds = xr.Dataset({'a': a, 'b': b}) - >>> ds2, = xr.broadcast(ds) # use tuple unpacking to extract one dataset + >>> ds = xr.Dataset({"a": a, "b": b}) + >>> (ds2,) = xr.broadcast(ds) # use tuple unpacking to extract one dataset >>> ds2 Dimensions: (x: 3, y: 2) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 3f6e0e79351..1f990457798 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -155,6 +155,7 @@ def _combine_nd( compat="no_conflicts", fill_value=dtypes.NA, join="outer", + combine_attrs="drop", ): """ Combines an N-dimensional structure of datasets into one by applying a @@ -202,13 +203,21 @@ def _combine_nd( compat=compat, fill_value=fill_value, join=join, + combine_attrs=combine_attrs, ) (combined_ds,) = combined_ids.values() return combined_ds def _combine_all_along_first_dim( - combined_ids, dim, data_vars, coords, compat, fill_value=dtypes.NA, join="outer" + combined_ids, + dim, + data_vars, + coords, + compat, + fill_value=dtypes.NA, + join="outer", + combine_attrs="drop", ): # Group into lines of datasets which must be combined along dim @@ -223,7 +232,7 @@ def _combine_all_along_first_dim( combined_ids = dict(sorted(group)) datasets = combined_ids.values() new_combined_ids[new_id] = _combine_1d( - datasets, dim, compat, data_vars, coords, fill_value, join + datasets, dim, compat, data_vars, coords, fill_value, join, combine_attrs ) return new_combined_ids @@ -236,6 +245,7 @@ def _combine_1d( coords="different", fill_value=dtypes.NA, join="outer", + combine_attrs="drop", ): """ Applies either concat or merge to 1D list of datasets depending on value @@ -252,6 +262,7 @@ def _combine_1d( compat=compat, fill_value=fill_value, join=join, + combine_attrs=combine_attrs, ) except ValueError as err: if "encountered unexpected variable" in str(err): 
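The one-line fix in ``_override_indexes`` above (iterating ``obj.indexes`` rather than ``obj.dims``) matters when some dimensions carry no index; a small sketch of ``join="override"`` in that situation, with made-up values:

```
import xarray as xr

a = xr.DataArray([[1, 2], [3, 4]], dims=("x", "y"), coords={"x": [10, 20]})
b = xr.DataArray([[5, 6], [7, 8]], dims=("x", "y"), coords={"x": [30, 40]})

# "y" has no index, so only the indexed "x" coordinate is rewritten
a2, b2 = xr.align(a, b, join="override")
print(b2.x.values)  # [10 20] - taken from the first object
```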
@@ -265,7 +276,13 @@ def _combine_1d( else: raise else: - combined = merge(datasets, compat=compat, fill_value=fill_value, join=join) + combined = merge( + datasets, + compat=compat, + fill_value=fill_value, + join=join, + combine_attrs=combine_attrs, + ) return combined @@ -284,6 +301,7 @@ def _nested_combine( ids, fill_value=dtypes.NA, join="outer", + combine_attrs="drop", ): if len(datasets) == 0: @@ -311,6 +329,7 @@ def _nested_combine( coords=coords, fill_value=fill_value, join=join, + combine_attrs=combine_attrs, ) return combined @@ -323,6 +342,7 @@ def combine_nested( coords="different", fill_value=dtypes.NA, join="outer", + combine_attrs="drop", ): """ Explicitly combine an N-dimensional grid of datasets into one by using a @@ -390,6 +410,16 @@ def combine_nested( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. + combine_attrs : {'drop', 'identical', 'no_conflicts', 'override'}, + default 'drop' + String indicating how to combine attrs of the objects being merged: + + - 'drop': empty attrs on returned Dataset. + - 'identical': all attrs must be the same on every object. + - 'no_conflicts': attrs from all objects are combined, any that have + the same name must also have the same value. + - 'override': skip comparing and copy attrs from the first dataset to + the result. Returns ------- @@ -412,7 +442,7 @@ def combine_nested( precipitation (x, y) float64 5.904 2.453 3.404 ... >>> ds_grid = [[x1y1, x1y2], [x2y1, x2y2]] - >>> combined = xr.combine_nested(ds_grid, concat_dim=['x', 'y']) + >>> combined = xr.combine_nested(ds_grid, concat_dim=["x", "y"]) Dimensions: (x: 4, y: 4) Dimensions without coordinates: x, y @@ -441,7 +471,7 @@ def combine_nested( precipitation (t) float64 5.904 2.453 3.404 ... >>> ds_grid = [[t1temp, t1precip], [t2temp, t2precip]] - >>> combined = xr.combine_nested(ds_grid, concat_dim=['t', None]) + >>> combined = xr.combine_nested(ds_grid, concat_dim=["t", None]) Dimensions: (t: 10) Dimensions without coordinates: t @@ -468,6 +498,7 @@ def combine_nested( ids=False, fill_value=fill_value, join=join, + combine_attrs=combine_attrs, ) @@ -482,6 +513,7 @@ def combine_by_coords( coords="different", fill_value=dtypes.NA, join="outer", + combine_attrs="no_conflicts", ): """ Attempt to auto-magically combine the given datasets into one by using @@ -557,6 +589,16 @@ def combine_by_coords( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. + combine_attrs : {'drop', 'identical', 'no_conflicts', 'override'}, + default 'drop' + String indicating how to combine attrs of the objects being merged: + + - 'drop': empty attrs on returned Dataset. + - 'identical': all attrs must be the same on every object. + - 'no_conflicts': attrs from all objects are combined, any that have + the same name must also have the same value. + - 'override': skip comparing and copy attrs from the first dataset to + the result. Returns ------- @@ -650,7 +692,7 @@ def combine_by_coords( temperature (y, x) float64 1.654 10.63 7.015 nan ... nan 12.46 2.22 15.96 precipitation (y, x) float64 0.2136 0.9974 0.7603 ... 
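A sketch of the new ``combine_attrs`` option on ``combine_nested`` (it assumes a build of xarray that already contains this changeset; the datasets are made up):

```
import xarray as xr

ds1 = xr.Dataset({"a": ("x", [1, 2])}, attrs={"source": "sensor-1"})
ds2 = xr.Dataset({"a": ("x", [3, 4])}, attrs={"source": "sensor-2"})

# the default "drop" returns empty attrs; "override" copies them from the first dataset
combined = xr.combine_nested([ds1, ds2], concat_dim="x", combine_attrs="override")
print(combined.attrs)  # {'source': 'sensor-1'}
# combine_attrs="identical" would raise here, because the two inputs disagree
```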
0.6125 0.4654 0.5953 - >>> xr.combine_by_coords([x3, x1], join='override') + >>> xr.combine_by_coords([x3, x1], join="override") Dimensions: (x: 3, y: 4) Coordinates: @@ -700,6 +742,7 @@ def combine_by_coords( compat=compat, fill_value=fill_value, join=join, + combine_attrs=combine_attrs, ) # Check the overall coordinates are monotonically increasing @@ -717,6 +760,7 @@ def combine_by_coords( compat=compat, fill_value=fill_value, join=join, + combine_attrs=combine_attrs, ) diff --git a/xarray/core/common.py b/xarray/core/common.py index 582ae310061..a003642076f 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -418,9 +418,9 @@ def assign_coords(self, coords=None, **coords_kwargs): -------- Convert longitude coordinates from 0-359 to -180-179: - >>> da = xr.DataArray(np.random.rand(4), - ... coords=[np.array([358, 359, 0, 1])], - ... dims='lon') + >>> da = xr.DataArray( + ... np.random.rand(4), coords=[np.array([358, 359, 0, 1])], dims="lon", + ... ) >>> da array([0.28298 , 0.667347, 0.657938, 0.177683]) @@ -434,7 +434,7 @@ def assign_coords(self, coords=None, **coords_kwargs): The function also accepts dictionary arguments: - >>> da.assign_coords({'lon': (((da.lon + 180) % 360) - 180)}) + >>> da.assign_coords({"lon": (((da.lon + 180) % 360) - 180)}) array([0.28298 , 0.667347, 0.657938, 0.177683]) Coordinates: @@ -518,19 +518,13 @@ def pipe( You can write - >>> (ds.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe(f, arg2=b, arg3=c) - ... ) + >>> (ds.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)) If you have a function that takes the data as (say) the second argument, pass a tuple indicating which keyword expects the data. For example, suppose ``f`` takes its data as ``arg2``: - >>> (ds.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe((f, 'arg2'), arg1=a, arg3=c) - ... ) + >>> (ds.pipe(h).pipe(g, arg1=a).pipe((f, "arg2"), arg1=a, arg3=c)) Examples -------- @@ -539,7 +533,10 @@ def pipe( >>> import xarray as xr >>> x = xr.Dataset( ... { - ... "temperature_c": (("lat", "lon"), 20 * np.random.rand(4).reshape(2, 2)), + ... "temperature_c": ( + ... ("lat", "lon"), + ... 20 * np.random.rand(4).reshape(2, 2), + ... ), ... "precipitation": (("lat", "lon"), np.random.rand(4).reshape(2, 2)), ... }, ... coords={"lat": [10, 20], "lon": [150, 160]}, @@ -584,10 +581,9 @@ def pipe( precipitation (lat, lon) float64 2.731 2.719 2.848 2.467 >>> ( - ... x - ... .pipe(adder, arg=2) - ... .pipe(div, arg=2) - ... .pipe(sub_mult, sub_arg=2, mult_arg=2) + ... x.pipe(adder, arg=2) + ... .pipe(div, arg=2) + ... .pipe(sub_mult, sub_arg=2, mult_arg=2) ... ) Dimensions: (lat: 2, lon: 2) @@ -639,16 +635,17 @@ def groupby(self, group, squeeze: bool = True, restore_coord_dims: bool = None): -------- Calculate daily anomalies for daily data: - >>> da = xr.DataArray(np.linspace(0, 1826, num=1827), - ... coords=[pd.date_range('1/1/2000', '31/12/2004', - ... freq='D')], - ... dims='time') + >>> da = xr.DataArray( + ... np.linspace(0, 1826, num=1827), + ... coords=[pd.date_range("1/1/2000", "31/12/2004", freq="D")], + ... dims="time", + ... ) >>> da array([0.000e+00, 1.000e+00, 2.000e+00, ..., 1.824e+03, 1.825e+03, 1.826e+03]) Coordinates: * time (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 ... 
- >>> da.groupby('time.dayofyear') - da.groupby('time.dayofyear').mean('time') + >>> da.groupby("time.dayofyear") - da.groupby("time.dayofyear").mean("time") array([-730.8, -730.8, -730.8, ..., 730.2, 730.2, 730.5]) Coordinates: @@ -748,11 +745,31 @@ def groupby_bins( }, ) + def weighted(self, weights): + """ + Weighted operations. + + Parameters + ---------- + weights : DataArray + An array of weights associated with the values in this Dataset. + Each value in the data contributes to the reduction operation + according to its associated weight. + + Notes + ----- + ``weights`` must be a DataArray and cannot contain missing values. + Missing values can be replaced by ``weights.fillna(0)``. + """ + + return self._weighted_cls(self, weights) + def rolling( self, dim: Mapping[Hashable, int] = None, min_periods: int = None, center: bool = False, + keep_attrs: bool = None, **window_kwargs: int, ): """ @@ -769,6 +786,10 @@ def rolling( setting min_periods equal to the size of the window. center : boolean, default False Set the labels at the center of the window. + keep_attrs : bool, optional + If True, the object's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. **window_kwargs : optional The keyword arguments form of ``dim``. One of dim or window_kwargs must be provided. @@ -782,10 +803,15 @@ def rolling( -------- Create rolling seasonal average of monthly data e.g. DJF, JFM, ..., SON: - >>> da = xr.DataArray(np.linspace(0, 11, num=12), - ... coords=[pd.date_range('15/12/1999', - ... periods=12, freq=pd.DateOffset(months=1))], - ... dims='time') + >>> da = xr.DataArray( + ... np.linspace(0, 11, num=12), + ... coords=[ + ... pd.date_range( + ... "15/12/1999", periods=12, freq=pd.DateOffset(months=1), + ... ) + ... ], + ... dims="time", + ... ) >>> da array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.]) @@ -799,7 +825,7 @@ def rolling( Remove the NaNs using ``dropna()``: - >>> da.rolling(time=3, center=True).mean().dropna('time') + >>> da.rolling(time=3, center=True).mean().dropna("time") array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]) Coordinates: @@ -810,8 +836,13 @@ def rolling( core.rolling.DataArrayRolling core.rolling.DatasetRolling """ + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + dim = either_dict_or_kwargs(dim, window_kwargs, "rolling") - return self._rolling_cls(self, dim, min_periods=min_periods, center=center) + return self._rolling_cls( + self, dim, min_periods=min_periods, center=center, keep_attrs=keep_attrs + ) def rolling_exp( self, @@ -859,6 +890,7 @@ def coarsen( boundary: str = "exact", side: Union[str, Mapping[Hashable, str]] = "left", coord_func: str = "mean", + keep_attrs: bool = None, **window_kwargs: int, ): """ @@ -879,8 +911,12 @@ def coarsen( multiple of the window size. If 'trim', the excess entries are dropped. If 'pad', NA will be padded. side : 'left' or 'right' or mapping from dimension to 'left' or 'right' - coord_func : function (name) that is applied to the coordintes, + coord_func : function (name) that is applied to the coordinates, or a mapping from coordinate name to function (name). + keep_attrs : bool, optional + If True, the object's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. Returns ------- @@ -891,10 +927,11 @@ def coarsen( -------- Coarsen the long time series by averaging over every four days. 
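The new ``weighted`` method above only constructs the helper object; the reductions themselves live in ``xarray/core/weighted.py``, added elsewhere in this changeset. Assuming those provide the usual weighted ``mean``/``sum``, usage looks roughly like this:

```
import xarray as xr

data = xr.DataArray([2.0, 4.0, 6.0], dims="x")
weights = xr.DataArray([1, 1, 2], dims="x")

# weighted mean: (2*1 + 4*1 + 6*2) / (1 + 1 + 2) = 4.5
print(data.weighted(weights).mean("x").item())
```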
- >>> da = xr.DataArray(np.linspace(0, 364, num=364), - ... dims='time', - ... coords={'time': pd.date_range( - ... '15/12/1999', periods=364)}) + >>> da = xr.DataArray( + ... np.linspace(0, 364, num=364), + ... dims="time", + ... coords={"time": pd.date_range("15/12/1999", periods=364)}, + ... ) >>> da array([ 0. , 1.002755, 2.00551 , ..., 361.99449 , 362.997245, @@ -902,7 +939,7 @@ def coarsen( Coordinates: * time (time) datetime64[ns] 1999-12-15 1999-12-16 ... 2000-12-12 >>> - >>> da.coarsen(time=3, boundary='trim').mean() + >>> da.coarsen(time=3, boundary="trim").mean() array([ 1.002755, 4.011019, 7.019284, ..., 358.986226, 361.99449 ]) @@ -915,9 +952,17 @@ def coarsen( core.rolling.DataArrayCoarsen core.rolling.DatasetCoarsen """ + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + dim = either_dict_or_kwargs(dim, window_kwargs, "coarsen") return self._coarsen_cls( - self, dim, boundary=boundary, side=side, coord_func=coord_func + self, + dim, + boundary=boundary, + side=side, + coord_func=coord_func, + keep_attrs=keep_attrs, ) def resample( @@ -977,10 +1022,15 @@ def resample( -------- Downsample monthly time-series data to seasonal data: - >>> da = xr.DataArray(np.linspace(0, 11, num=12), - ... coords=[pd.date_range('15/12/1999', - ... periods=12, freq=pd.DateOffset(months=1))], - ... dims='time') + >>> da = xr.DataArray( + ... np.linspace(0, 11, num=12), + ... coords=[ + ... pd.date_range( + ... "15/12/1999", periods=12, freq=pd.DateOffset(months=1), + ... ) + ... ], + ... dims="time", + ... ) >>> da array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.]) @@ -994,7 +1044,7 @@ def resample( Upsample monthly time-series data to daily data: - >>> da.resample(time='1D').interpolate('linear') + >>> da.resample(time="1D").interpolate("linear") array([ 0. , 0.032258, 0.064516, ..., 10.935484, 10.967742, 11. 
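Per the documented default above, window operations still drop ``attrs`` unless ``keep_attrs=True`` is passed; a minimal sketch with ``coarsen`` (the same flag is added to ``rolling``):

```
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(12.0), dims="time", attrs={"units": "degC"})

print(da.coarsen(time=3).mean().attrs)                   # {} - dropped by default
print(da.coarsen(time=3, keep_attrs=True).mean().attrs)  # {'units': 'degC'}
```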
]) Coordinates: @@ -1002,7 +1052,7 @@ def resample( Limit scope of upsampling method - >>> da.resample(time='1D').nearest(tolerance='1D') + >>> da.resample(time="1D").nearest(tolerance="1D") array([ 0., 0., nan, ..., nan, 11., 11.]) Coordinates: @@ -1095,7 +1145,16 @@ def where(self, cond, other=dtypes.NA, drop: bool = False): -------- >>> import numpy as np - >>> a = xr.DataArray(np.arange(25).reshape(5, 5), dims=('x', 'y')) + >>> a = xr.DataArray(np.arange(25).reshape(5, 5), dims=("x", "y")) + >>> a + + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24]]) + Dimensions without coordinates: x, y + >>> a.where(a.x + a.y < 4) array([[ 0., 1., 2., 3., nan], @@ -1104,6 +1163,7 @@ def where(self, cond, other=dtypes.NA, drop: bool = False): [ 15., nan, nan, nan, nan], [ nan, nan, nan, nan, nan]]) Dimensions without coordinates: x, y + >>> a.where(a.x + a.y < 5, -1) array([[ 0, 1, 2, 3, 4], @@ -1112,6 +1172,7 @@ def where(self, cond, other=dtypes.NA, drop: bool = False): [15, 16, -1, -1, -1], [20, -1, -1, -1, -1]]) Dimensions without coordinates: x, y + >>> a.where(a.x + a.y < 4, drop=True) array([[ 0., 1., 2., 3.], @@ -1120,6 +1181,14 @@ def where(self, cond, other=dtypes.NA, drop: bool = False): [ 15., nan, nan, nan]]) Dimensions without coordinates: x, y + >>> a.where(lambda x: x.x + x.y < 4, drop=True) + + array([[ 0., 1., 2., 3.], + [ 5., 6., 7., nan], + [ 10., 11., nan, nan], + [ 15., nan, nan, nan]]) + Dimensions without coordinates: x, y + See also -------- numpy.where : corresponding numpy function @@ -1129,6 +1198,9 @@ def where(self, cond, other=dtypes.NA, drop: bool = False): from .dataarray import DataArray from .dataset import Dataset + if callable(cond): + cond = cond(self) + if drop: if other is not dtypes.NA: raise ValueError("cannot set `other` if drop=True") @@ -1182,7 +1254,7 @@ def isin(self, test_elements): Examples -------- - >>> array = xr.DataArray([1, 2, 3], dims='x') + >>> array = xr.DataArray([1, 2, 3], dims="x") >>> array.isin([1, 3]) array([ True, False, True]) @@ -1251,9 +1323,11 @@ def full_like(other, fill_value, dtype: DTypeLike = None): >>> import numpy as np >>> import xarray as xr - >>> x = xr.DataArray(np.arange(6).reshape(2, 3), - ... dims=['lat', 'lon'], - ... coords={'lat': [1, 2], 'lon': [0, 1, 2]}) + >>> x = xr.DataArray( + ... np.arange(6).reshape(2, 3), + ... dims=["lat", "lon"], + ... coords={"lat": [1, 2], "lon": [0, 1, 2]}, + ... ) >>> x array([[0, 1, 2], @@ -1365,9 +1439,11 @@ def zeros_like(other, dtype: DTypeLike = None): >>> import numpy as np >>> import xarray as xr - >>> x = xr.DataArray(np.arange(6).reshape(2, 3), - ... dims=['lat', 'lon'], - ... coords={'lat': [1, 2], 'lon': [0, 1, 2]}) + >>> x = xr.DataArray( + ... np.arange(6).reshape(2, 3), + ... dims=["lat", "lon"], + ... coords={"lat": [1, 2], "lon": [0, 1, 2]}, + ... ) >>> x array([[0, 1, 2], @@ -1423,9 +1499,11 @@ def ones_like(other, dtype: DTypeLike = None): >>> import numpy as np >>> import xarray as xr - >>> x = xr.DataArray(np.arange(6).reshape(2, 3), - ... dims=['lat', 'lon'], - ... coords={'lat': [1, 2], 'lon': [0, 1, 2]}) + >>> x = xr.DataArray( + ... np.arange(6).reshape(2, 3), + ... dims=["lat", "lon"], + ... coords={"lat": [1, 2], "lon": [0, 1, 2]}, + ... 
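The callable-``cond`` support added to ``where`` above also composes with the ``other`` argument; a tiny sketch:

```
import numpy as np
import xarray as xr

a = xr.DataArray(np.arange(6), dims="x")
# the condition is evaluated as cond(a) internally
print(a.where(lambda v: v % 2 == 0, -1).values)  # [ 0 -1  2 -1  4 -1]
```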
) >>> x array([[0, 1, 2], @@ -1434,7 +1512,7 @@ def ones_like(other, dtype: DTypeLike = None): * lat (lat) int64 1 2 * lon (lon) int64 0 1 2 - >>> >>> xr.ones_like(x) + >>> xr.ones_like(x) array([[1, 1, 1], [1, 1, 1]]) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index d2c5c32bc00..13bf6248331 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -889,7 +889,7 @@ def apply_ufunc( You can now apply ``magnitude()`` to ``xr.DataArray`` and ``xr.Dataset`` objects, with automatically preserved dimensions and coordinates, e.g., - >>> array = xr.DataArray([1, 2, 3], coords=[('x', [0.1, 0.2, 0.3])]) + >>> array = xr.DataArray([1, 2, 3], coords=[("x", [0.1, 0.2, 0.3])]) >>> magnitude(array, -array) array([1.414214, 2.828427, 4.242641]) @@ -1093,10 +1093,9 @@ def dot(*arrays, dims=None, **kwargs): >>> import numpy as np >>> import xarray as xr - >>> da_a = xr.DataArray(np.arange(3 * 2).reshape(3, 2), dims=['a', 'b']) - >>> da_b = xr.DataArray(np.arange(3 * 2 * 2).reshape(3, 2, 2), - ... dims=['a', 'b', 'c']) - >>> da_c = xr.DataArray(np.arange(2 * 3).reshape(2, 3), dims=['c', 'd']) + >>> da_a = xr.DataArray(np.arange(3 * 2).reshape(3, 2), dims=["a", "b"]) + >>> da_b = xr.DataArray(np.arange(3 * 2 * 2).reshape(3, 2, 2), dims=["a", "b", "c"]) + >>> da_c = xr.DataArray(np.arange(2 * 3).reshape(2, 3), dims=["c", "d"]) >>> da_a @@ -1121,18 +1120,18 @@ def dot(*arrays, dims=None, **kwargs): [3, 4, 5]]) Dimensions without coordinates: c, d - >>> xr.dot(da_a, da_b, dims=['a', 'b']) + >>> xr.dot(da_a, da_b, dims=["a", "b"]) array([110, 125]) Dimensions without coordinates: c - >>> xr.dot(da_a, da_b, dims=['a']) + >>> xr.dot(da_a, da_b, dims=["a"]) array([[40, 46], [70, 79]]) Dimensions without coordinates: b, c - >>> xr.dot(da_a, da_b, da_c, dims=['b', 'c']) + >>> xr.dot(da_a, da_b, da_c, dims=["b", "c"]) array([[ 9, 14, 19], [ 93, 150, 207], @@ -1225,9 +1224,13 @@ def where(cond, x, y): ---------- cond : scalar, array, Variable, DataArray or Dataset with boolean dtype When True, return values from `x`, otherwise returns values from `y`. - x, y : scalar, array, Variable, DataArray or Dataset - Values from which to choose. All dimension coordinates on these objects - must be aligned with each other and with `cond`. + x : scalar, array, Variable, DataArray or Dataset + values to choose from where `cond` is True + y : scalar, array, Variable, DataArray or Dataset + values to choose from where `cond` is False + + All dimension coordinates on these objects must be aligned with each + other and with `cond`. Returns ------- @@ -1238,21 +1241,25 @@ def where(cond, x, y): -------- >>> import xarray as xr >>> import numpy as np - >>> x = xr.DataArray(0.1 * np.arange(10), dims=['lat'], - ... coords={'lat': np.arange(10)}, name='sst') + >>> x = xr.DataArray( + ... 0.1 * np.arange(10), + ... dims=["lat"], + ... coords={"lat": np.arange(10)}, + ... name="sst", + ... ) >>> x array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) Coordinates: * lat (lat) int64 0 1 2 3 4 5 6 7 8 9 - >>> xr.where(x < 0.5, x, 100*x) + >>> xr.where(x < 0.5, x, x * 100) array([ 0. , 0.1, 0.2, 0.3, 0.4, 50. , 60. , 70. , 80. , 90. ]) Coordinates: * lat (lat) int64 0 1 2 3 4 5 6 7 8 9 - >>> >>> y = xr.DataArray( + >>> y = xr.DataArray( ... 0.1 * np.arange(9).reshape(3, 3), ... dims=["lat", "lon"], ... 
coords={"lat": np.arange(3), "lon": 10 + np.arange(3)}, @@ -1276,8 +1283,8 @@ def where(cond, x, y): * lat (lat) int64 0 1 2 * lon (lon) int64 10 11 12 - >>> cond = xr.DataArray([True, False], dims=['x']) - >>> x = xr.DataArray([1, 2], dims=['y']) + >>> cond = xr.DataArray([True, False], dims=["x"]) + >>> x = xr.DataArray([1, 2], dims=["y"]) >>> xr.where(cond, x, 0) array([[1, 2], @@ -1299,3 +1306,35 @@ def where(cond, x, y): dataset_join="exact", dask="allowed", ) + + +def polyval(coord, coeffs, degree_dim="degree"): + """Evaluate a polynomial at specific values + + Parameters + ---------- + coord : DataArray + The 1D coordinate along which to evaluate the polynomial. + coeffs : DataArray + Coefficients of the polynomials. + degree_dim : str, default "degree" + Name of the polynomial degree dimension in `coeffs`. + + See also + -------- + xarray.DataArray.polyfit + numpy.polyval + """ + from .dataarray import DataArray + from .missing import get_clean_interp_index + + x = get_clean_interp_index(coord, coord.name) + + deg_coord = coeffs[degree_dim] + + lhs = DataArray( + np.vander(x, int(deg_coord.max()) + 1), + dims=(coord.name, degree_dim), + coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)[::-1]}, + ) + return (lhs * coeffs).sum(degree_dim) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 96b4be15d1b..7741cbb826b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -3,7 +3,7 @@ from . import dtypes, utils from .alignment import align from .duck_array_ops import lazy_array_equiv -from .merge import _VALID_COMPAT, unique_variable +from .merge import _VALID_COMPAT, merge_attrs, unique_variable from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars @@ -17,6 +17,7 @@ def concat( positions=None, fill_value=dtypes.NA, join="outer", + combine_attrs="override", ): """Concatenate xarray objects along a new or existing dimension. @@ -92,15 +93,21 @@ def concat( - 'override': if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. + combine_attrs : {'drop', 'identical', 'no_conflicts', 'override'}, + default 'override + String indicating how to combine attrs of the objects being merged: + + - 'drop': empty attrs on returned Dataset. + - 'identical': all attrs must be the same on every object. + - 'no_conflicts': attrs from all objects are combined, any that have + the same name must also have the same value. + - 'override': skip comparing and copy attrs from the first dataset to + the result. Returns ------- concatenated : type of objs - Notes - ----- - Each concatenated Variable preserves corresponding ``attrs`` from the first element of ``objs``. 
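A sketch of the new ``polyval`` helper with hand-written coefficients, highest degree first to match the ``degree`` coordinate that ``polyfit`` produces (the values are made up):

```
import xarray as xr

x = xr.DataArray(
    [0.0, 1.0, 2.0, 3.0], dims="x", coords={"x": [0.0, 1.0, 2.0, 3.0]}, name="x"
)
# coefficients of the polynomial 2*x + 1
coeffs = xr.DataArray([2.0, 1.0], dims="degree", coords={"degree": [1, 0]})

print(xr.polyval(x, coeffs).values)  # [1. 3. 5. 7.]
```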
- See also -------- merge @@ -132,7 +139,9 @@ def concat( "can only concatenate xarray Dataset and DataArray " "objects, got %s" % type(first_obj) ) - return f(objs, dim, data_vars, coords, compat, positions, fill_value, join) + return f( + objs, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs + ) def _calc_concat_dim_coord(dim): @@ -306,6 +315,7 @@ def _dataset_concat( positions, fill_value=dtypes.NA, join="outer", + combine_attrs="override", ): """ Concatenate a sequence of datasets along a new or existing dimension @@ -362,7 +372,7 @@ def _dataset_concat( result_vars.update(dim_coords) # assign attrs and encoding from first dataset - result_attrs = datasets[0].attrs + result_attrs = merge_attrs([ds.attrs for ds in datasets], combine_attrs) result_encoding = datasets[0].encoding # check that global attributes are fixed across all datasets if necessary @@ -425,6 +435,7 @@ def _dataarray_concat( positions, fill_value=dtypes.NA, join="outer", + combine_attrs="override", ): arrays = list(arrays) @@ -453,5 +464,12 @@ def _dataarray_concat( positions, fill_value=fill_value, join=join, + combine_attrs="drop", ) - return arrays[0]._from_temp_dataset(ds, name) + + merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs) + + result = arrays[0]._from_temp_dataset(ds, name) + result.attrs = merged_attrs + + return result diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 3d51c9b4271..83c4d2a8636 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -247,7 +247,7 @@ def __delitem__(self, key: Hashable) -> None: if key in self: del self._data[key] else: - raise KeyError(key) + raise KeyError(f"{key!r} is not a coordinate variable.") def _ipython_key_completions_(self): """Provide method for the key-autocompletions in IPython. """ @@ -291,7 +291,7 @@ def _update_coords( dims = calculate_dimensions(coords_plus_data) if not set(dims) <= set(self.dims): raise ValueError( - "cannot add coordinates with new dimensions to " "a DataArray" + "cannot add coordinates with new dimensions to a DataArray" ) self._data._coords = coords @@ -312,7 +312,12 @@ def to_dataset(self) -> "Dataset": return Dataset._construct_direct(coords, set(coords)) def __delitem__(self, key: Hashable) -> None: - del self._data._coords[key] + if key in self: + del self._data._coords[key] + if self._data._indexes is not None and key in self._data._indexes: + del self._data._indexes[key] + else: + raise KeyError(f"{key!r} is not a coordinate variable.") def _ipython_key_completions_(self): """Provide method for the key-autocompletions in IPython. """ diff --git a/xarray/core/dask_array_compat.py b/xarray/core/dask_array_compat.py index 05f750a1355..94c50d90e84 100644 --- a/xarray/core/dask_array_compat.py +++ b/xarray/core/dask_array_compat.py @@ -1,3 +1,4 @@ +import warnings from distutils.version import LooseVersion from typing import Iterable @@ -99,6 +100,52 @@ def meta_from_array(x, ndim=None, dtype=None): return meta +def _validate_pad_output_shape(input_shape, pad_width, output_shape): + """ Validates the output shape of dask.array.pad, raising a RuntimeError if they do not match. + In the current versions of dask (2.2/2.4), dask.array.pad with mode='reflect' sometimes returns + an invalid shape. 
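With ``merge_attrs`` threaded through ``concat`` above, attribute handling is now configurable there as well; a small sketch (assumes this changeset is installed):

```
import xarray as xr

a = xr.DataArray([1, 2], dims="x", attrs={"units": "m"})
b = xr.DataArray([3, 4], dims="x", attrs={"units": "m"})

# "override" (the default) keeps the first object's attrs, matching the old behaviour;
# "identical" additionally insists that every input agrees
combined = xr.concat([a, b], dim="x", combine_attrs="identical")
print(combined.attrs)  # {'units': 'm'}
```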
+ """ + isint = lambda i: isinstance(i, int) + + if isint(pad_width): + pass + elif len(pad_width) == 2 and all(map(isint, pad_width)): + pad_width = sum(pad_width) + elif ( + len(pad_width) == len(input_shape) + and all(map(lambda x: len(x) == 2, pad_width)) + and all((isint(i) for p in pad_width for i in p)) + ): + pad_width = np.sum(pad_width, axis=1) + else: + # unreachable: dask.array.pad should already have thrown an error + raise ValueError("Invalid value for `pad_width`") + + if not np.array_equal(np.array(input_shape) + pad_width, output_shape): + raise RuntimeError( + "There seems to be something wrong with the shape of the output of dask.array.pad, " + "try upgrading Dask, use a different pad mode e.g. mode='constant' or first convert " + "your DataArray/Dataset to one backed by a numpy array by calling the `compute()` method." + "See: https://github.com/dask/dask/issues/5303" + ) + + +def pad(array, pad_width, mode="constant", **kwargs): + padded = da.pad(array, pad_width, mode=mode, **kwargs) + # workaround for inconsistency between numpy and dask: https://github.com/dask/dask/issues/5303 + if mode == "mean" and issubclass(array.dtype.type, np.integer): + warnings.warn( + 'dask.array.pad(mode="mean") converts integers to floats. xarray converts ' + "these floats back to integers to keep the interface consistent. There is a chance that " + "this introduces rounding errors. If you wish to keep the values as floats, first change " + "the dtype to a float before calling pad.", + UserWarning, + ) + return da.round(padded).astype(array.dtype) + _validate_pad_output_shape(array.shape, pad_width, padded.shape) + return padded + + if LooseVersion(dask_version) >= LooseVersion("2.8.1"): median = da.median else: diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 37f261cc3ad..87f646352eb 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -95,3 +95,30 @@ def func(x, window, axis=-1): # crop boundary. index = (slice(None),) * axis + (slice(drop_size, drop_size + orig_shape[axis]),) return out[index] + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + import dask.array as da + + lhs_da = da.from_array(lhs, chunks=(rhs.chunks[0], lhs.shape[1])) + if skipna: + added_dim = rhs.ndim == 1 + if added_dim: + rhs = rhs.reshape(rhs.shape[0], 1) + results = da.apply_along_axis( + nputils._nanpolyfit_1d, + 0, + rhs, + lhs_da, + dtype=float, + shape=(lhs.shape[1] + 1,), + rcond=rcond, + ) + coeffs = results[:-1, ...] + residuals = results[-1, ...] + if added_dim: + coeffs = coeffs.reshape(coeffs.shape[0]) + residuals = residuals.reshape(residuals.shape[0]) + else: + coeffs, residuals, _, _ = da.linalg.lstsq(lhs_da, rhs) + return coeffs, residuals diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e3c81541760..bc6c730c879 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -33,6 +33,7 @@ resample, rolling, utils, + weighted, ) from .accessor_dt import CombinedDatetimelikeAccessor from .accessor_str import StringAccessor @@ -258,6 +259,7 @@ class DataArray(AbstractArray, DataWithCoords): _rolling_cls = rolling.DataArrayRolling _coarsen_cls = rolling.DataArrayCoarsen _resample_cls = resample.DataArrayResample + _weighted_cls = weighted.DataArrayWeighted dt = property(CombinedDatetimelikeAccessor) @@ -304,8 +306,7 @@ def __init__( Name(s) of the data dimension(s). Must be either a hashable (only for 1D data) or a sequence of hashables with length equal to the number of dimensions. 
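The shape check in ``_validate_pad_output_shape`` reduces to adding the per-axis pad widths to the input shape; a plain-numpy illustration (no dask needed):

```
import numpy as np

input_shape = (3, 4)
pad_width = ((1, 2), (0, 3))  # (before, after) for each axis

expected = np.array(input_shape) + np.sum(pad_width, axis=1)
padded = np.pad(np.zeros(input_shape), pad_width, mode="reflect")
print(expected, padded.shape)  # [6 7] (6, 7)
```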
If this argument is omitted, dimension names - are taken from ``coords`` (if possible) and otherwise default to - ``['dim_0', ... 'dim_n']``. + default to ``['dim_0', ... 'dim_n']``. name : str or None, optional Name of this array. attrs : dict_like or None, optional @@ -474,7 +475,13 @@ def _to_dataset_whole( dataset = Dataset._construct_direct(variables, coord_names, indexes=indexes) return dataset - def to_dataset(self, dim: Hashable = None, *, name: Hashable = None) -> Dataset: + def to_dataset( + self, + dim: Hashable = None, + *, + name: Hashable = None, + promote_attrs: bool = False, + ) -> Dataset: """Convert a DataArray to a Dataset. Parameters @@ -486,6 +493,8 @@ def to_dataset(self, dim: Hashable = None, *, name: Hashable = None) -> Dataset: name : hashable, optional Name to substitute for this array's name. Only valid if ``dim`` is not provided. + promote_attrs : bool, default False + Set to True to shallow copy attrs of DataArray to returned Dataset. Returns ------- @@ -499,9 +508,14 @@ def to_dataset(self, dim: Hashable = None, *, name: Hashable = None) -> Dataset: if dim is not None: if name is not None: raise TypeError("cannot supply both dim and name arguments") - return self._to_dataset_split(dim) + result = self._to_dataset_split(dim) else: - return self._to_dataset_whole(name) + result = self._to_dataset_whole(name) + + if promote_attrs: + result.attrs = dict(self.attrs) + + return result @property def name(self) -> Optional[Hashable]: @@ -876,8 +890,7 @@ def copy(self, deep: bool = True, data: Any = None) -> "DataArray": Shallow versus deep copy - >>> array = xr.DataArray([1, 2, 3], dims='x', - ... coords={'x': ['a', 'b', 'c']}) + >>> array = xr.DataArray([1, 2, 3], dims="x", coords={"x": ["a", "b", "c"]}) >>> array.copy() array([1, 2, 3]) @@ -1345,7 +1358,7 @@ def interp( Examples -------- - >>> da = xr.DataArray([1, 3], [('x', np.arange(2))]) + >>> da = xr.DataArray([1, 3], [("x", np.arange(2))]) >>> da.interp(x=0.5) array(2.0) @@ -1476,20 +1489,24 @@ def swap_dims(self, dims_dict: Mapping[Hashable, Hashable]) -> "DataArray": Examples -------- - >>> arr = xr.DataArray(data=[0, 1], dims="x", - coords={"x": ["a", "b"], "y": ("x", [0, 1])}) + + >>> arr = xr.DataArray( + ... data=[0, 1], dims="x", coords={"x": ["a", "b"], "y": ("x", [0, 1])}, + ... ) >>> arr array([0, 1]) Coordinates: * x (x) >> arr.swap_dims({"x": "y"}) array([0, 1]) Coordinates: x (y) >> arr.swap_dims({"x": "z"}) array([0, 1]) @@ -1590,12 +1607,11 @@ def set_index( Examples -------- - >>> arr = xr.DataArray(data=np.ones((2, 3)), - ... dims=['x', 'y'], - ... coords={'x': - ... range(2), 'y': - ... range(3), 'a': ('x', [3, 4]) - ... }) + >>> arr = xr.DataArray( + ... data=np.ones((2, 3)), + ... dims=["x", "y"], + ... coords={"x": range(2), "y": range(3), "a": ("x", [3, 4])}, + ... ) >>> arr array([[1., 1., 1.], @@ -1604,7 +1620,7 @@ def set_index( * x (x) int64 0 1 * y (y) int64 0 1 2 a (x) int64 3 4 - >>> arr.set_index(x='a') + >>> arr.set_index(x="a") array([[1., 1., 1.], [1., 1., 1.]]) @@ -1706,7 +1722,9 @@ def stack( ---------- dimensions : Mapping of the form new_name=(dim1, dim2, ...) Names of new dimensions, and the existing dimensions that they - replace. + replace. An ellipsis (`...`) will be replaced by all unlisted dimensions. + Passing a list containing an ellipsis (`stacked_dim=[...]`) will stack over + all dimensions. **dimensions_kwargs: The keyword arguments form of ``dimensions``. One of dimensions or dimensions_kwargs must be provided. 
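A quick sketch of the new ``promote_attrs`` flag on ``to_dataset`` described above:

```
import xarray as xr

da = xr.DataArray([1, 2, 3], dims="x", name="a", attrs={"units": "m"})

print(da.to_dataset().attrs)                    # {} - attrs stay on the variable only
print(da.to_dataset(promote_attrs=True).attrs)  # {'units': 'm'}
```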
@@ -1719,8 +1737,10 @@ def stack( Examples -------- - >>> arr = DataArray(np.arange(6).reshape(2, 3), - ... coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) + >>> arr = xr.DataArray( + ... np.arange(6).reshape(2, 3), + ... coords=[("x", ["a", "b"]), ("y", [0, 1, 2])], + ... ) >>> arr array([[0, 1, 2], @@ -1728,8 +1748,8 @@ def stack( Coordinates: * x (x) |S1 'a' 'b' * y (y) int64 0 1 2 - >>> stacked = arr.stack(z=('x', 'y')) - >>> stacked.indexes['z'] + >>> stacked = arr.stack(z=("x", "y")) + >>> stacked.indexes["z"] MultiIndex(levels=[['a', 'b'], [0, 1, 2]], codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=['x', 'y']) @@ -1769,8 +1789,10 @@ def unstack( Examples -------- - >>> arr = DataArray(np.arange(6).reshape(2, 3), - ... coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) + >>> arr = xr.DataArray( + ... np.arange(6).reshape(2, 3), + ... coords=[("x", ["a", "b"]), ("y", [0, 1, 2])], + ... ) >>> arr array([[0, 1, 2], @@ -1778,8 +1800,8 @@ def unstack( Coordinates: * x (x) |S1 'a' 'b' * y (y) int64 0 1 2 - >>> stacked = arr.stack(z=('x', 'y')) - >>> stacked.indexes['z'] + >>> stacked = arr.stack(z=("x", "y")) + >>> stacked.indexes["z"] MultiIndex(levels=[['a', 'b'], [0, 1, 2]], codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=['x', 'y']) @@ -1818,9 +1840,11 @@ def to_unstacked_dataset(self, dim, level=0): Examples -------- >>> import xarray as xr - >>> arr = DataArray(np.arange(6).reshape(2, 3), - ... coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) - >>> data = xr.Dataset({'a': arr, 'b': arr.isel(y=0)}) + >>> arr = xr.DataArray( + ... np.arange(6).reshape(2, 3), + ... coords=[("x", ["a", "b"]), ("y", [0, 1, 2])], + ... ) + >>> data = xr.Dataset({"a": arr, "b": arr.isel(y=0)}) >>> data Dimensions: (x: 2, y: 3) @@ -1830,12 +1854,12 @@ def to_unstacked_dataset(self, dim, level=0): Data variables: a (x, y) int64 0 1 2 3 4 5 b (x) int64 0 3 - >>> stacked = data.to_stacked_array("z", ['y']) - >>> stacked.indexes['z'] + >>> stacked = data.to_stacked_array("z", ["y"]) + >>> stacked.indexes["z"] MultiIndex(levels=[['a', 'b'], [0, 1, 2]], labels=[[0, 0, 0, 1], [0, 1, 2, -1]], names=['variable', 'y']) - >>> roundtripped = stacked.to_unstacked_dataset(dim='z') + >>> roundtripped = stacked.to_unstacked_dataset(dim="z") >>> data.identical(roundtripped) True @@ -2244,20 +2268,14 @@ def to_pandas(self) -> Union["DataArray", pd.Series, pd.DataFrame]: * 0D -> `xarray.DataArray` * 1D -> `pandas.Series` * 2D -> `pandas.DataFrame` - * 3D -> `pandas.Panel` *(deprecated)* - Only works for arrays with 3 or fewer dimensions. + Only works for arrays with 2 or fewer dimensions. The DataArray constructor performs the inverse transformation. """ # TODO: consolidate the info about pandas constructors and the # attributes that correspond to their indexes into a separate module? - constructors = { - 0: lambda x: x, - 1: pd.Series, - 2: pd.DataFrame, - 3: pdcompat.Panel, - } + constructors = {0: lambda x: x, 1: pd.Series, 2: pd.DataFrame} try: constructor = constructors[self.ndim] except KeyError: @@ -2630,7 +2648,7 @@ def plot(self) -> _PlotMethods: """ Access plotting functions for DataArray's - >>> d = DataArray([[1, 2], [3, 4]]) + >>> d = xr.DataArray([[1, 2], [3, 4]]) For convenience just call this directly @@ -2693,15 +2711,21 @@ def diff(self, dim: Hashable, n: int = 1, label: Hashable = "upper") -> "DataArr difference : same type as caller The n-th order finite difference of this object. + .. note:: + + `n` matches numpy's behavior and is different from pandas' first + argument named `periods`. 
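Since ``pandas.Panel`` is gone, ``to_pandas`` above now only covers 0-2 dimensions; higher-dimensional data has to go through something like ``to_dataframe`` instead. A short sketch:

```
import numpy as np
import xarray as xr

xr.DataArray([1, 2, 3]).to_pandas()   # 1D -> pandas.Series
xr.DataArray(np.eye(2)).to_pandas()   # 2D -> pandas.DataFrame
# 3D and higher now raise a ValueError instead of returning a Panel
```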
+ + Examples -------- - >>> arr = xr.DataArray([5, 5, 6, 6], [[1, 2, 3, 4]], ['x']) - >>> arr.diff('x') + >>> arr = xr.DataArray([5, 5, 6, 6], [[1, 2, 3, 4]], ["x"]) + >>> arr.diff("x") array([0, 1, 0]) Coordinates: * x (x) int64 2 3 4 - >>> arr.diff('x', 2) + >>> arr.diff("x", 2) array([ 1, -1]) Coordinates: @@ -2751,7 +2775,7 @@ def shift( Examples -------- - >>> arr = xr.DataArray([5, 6, 7], dims='x') + >>> arr = xr.DataArray([5, 6, 7], dims="x") >>> arr.shift(x=1) array([ nan, 5., 6.]) @@ -2801,7 +2825,7 @@ def roll( Examples -------- - >>> arr = xr.DataArray([5, 6, 7], dims='x') + >>> arr = xr.DataArray([5, 6, 7], dims="x") >>> arr.roll(x=1) array([7, 5, 6]) @@ -2850,18 +2874,20 @@ def dot( -------- >>> da_vals = np.arange(6 * 5 * 4).reshape((6, 5, 4)) - >>> da = DataArray(da_vals, dims=['x', 'y', 'z']) + >>> da = xr.DataArray(da_vals, dims=["x", "y", "z"]) >>> dm_vals = np.arange(4) - >>> dm = DataArray(dm_vals, dims=['z']) + >>> dm = xr.DataArray(dm_vals, dims=["z"]) >>> dm.dims ('z') + >>> da.dims ('x', 'y', 'z') >>> dot_result = da.dot(dm) >>> dot_result.dims ('x', 'y') + """ if isinstance(other, Dataset): raise NotImplementedError( @@ -2910,9 +2936,11 @@ def sortby( Examples -------- - >>> da = xr.DataArray(np.random.rand(5), - ... coords=[pd.date_range('1/1/2000', periods=5)], - ... dims='time') + >>> da = xr.DataArray( + ... np.random.rand(5), + ... coords=[pd.date_range("1/1/2000", periods=5)], + ... dims="time", + ... ) >>> da array([ 0.965471, 0.615637, 0.26532 , 0.270962, 0.552878]) @@ -2934,6 +2962,7 @@ def quantile( dim: Union[Hashable, Sequence[Hashable], None] = None, interpolation: str = "linear", keep_attrs: bool = None, + skipna: bool = True, ) -> "DataArray": """Compute the qth quantile of the data along the specified dimension. @@ -2961,6 +2990,8 @@ def quantile( If True, the dataset's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. + skipna : bool, optional + Whether to skip missing values when aggregating. Returns ------- @@ -2973,7 +3004,7 @@ def quantile( See Also -------- - numpy.nanquantile, pandas.Series.quantile, Dataset.quantile + numpy.nanquantile, numpy.quantile, pandas.Series.quantile, Dataset.quantile Examples -------- @@ -3010,7 +3041,11 @@ def quantile( """ ds = self._to_temp_dataset().quantile( - q, dim=dim, keep_attrs=keep_attrs, interpolation=interpolation + q, + dim=dim, + keep_attrs=keep_attrs, + interpolation=interpolation, + skipna=skipna, ) return self._from_temp_dataset(ds) @@ -3046,8 +3081,8 @@ def rank( Examples -------- - >>> arr = xr.DataArray([5, 6, 7], dims='x') - >>> arr.rank('x') + >>> arr = xr.DataArray([5, 6, 7], dims="x") + >>> arr.rank("x") array([ 1., 2., 3.]) Dimensions without coordinates: x @@ -3087,8 +3122,11 @@ def differentiate( Examples -------- - >>> da = xr.DataArray(np.arange(12).reshape(4, 3), dims=['x', 'y'], - ... coords={'x': [0, 0.1, 1.1, 1.2]}) + >>> da = xr.DataArray( + ... np.arange(12).reshape(4, 3), + ... dims=["x", "y"], + ... coords={"x": [0, 0.1, 1.1, 1.2]}, + ... ) >>> da array([[ 0, 1, 2], @@ -3099,7 +3137,7 @@ def differentiate( * x (x) float64 0.0 0.1 1.1 1.2 Dimensions without coordinates: y >>> - >>> da.differentiate('x') + >>> da.differentiate("x") array([[30. , 30. , 30. ], [27.545455, 27.545455, 27.545455], @@ -3141,8 +3179,11 @@ def integrate( Examples -------- - >>> da = xr.DataArray(np.arange(12).reshape(4, 3), dims=['x', 'y'], - ... 
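The new ``skipna`` argument to ``quantile`` above switches between the nan-aware and plain numpy implementations; a minimal sketch:

```
import numpy as np
import xarray as xr

da = xr.DataArray([0.0, 1.0, 2.0, np.nan], dims="x")

print(da.quantile(0.5).item())                # 1.0 - missing values skipped by default
print(da.quantile(0.5, skipna=False).item())  # nan - propagated, like numpy.quantile
```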
coords={'x': [0, 0.1, 1.1, 1.2]}) + >>> da = xr.DataArray( + ... np.arange(12).reshape(4, 3), + ... dims=["x", "y"], + ... coords={"x": [0, 0.1, 1.1, 1.2]}, + ... ) >>> da array([[ 0, 1, 2], @@ -3153,7 +3194,7 @@ def integrate( * x (x) float64 0.0 0.1 1.1 1.2 Dimensions without coordinates: y >>> - >>> da.integrate('x') + >>> da.integrate("x") array([5.4, 6.6, 7.8]) Dimensions without coordinates: y @@ -3238,6 +3279,236 @@ def map_blocks( return map_blocks(func, self, args, kwargs, template) + def polyfit( + self, + dim: Hashable, + deg: int, + skipna: bool = None, + rcond: float = None, + w: Union[Hashable, Any] = None, + full: bool = False, + cov: bool = False, + ): + """ + Least squares polynomial fit. + + This replicates the behaviour of `numpy.polyfit` but differs by skipping + invalid values when `skipna = True`. + + Parameters + ---------- + dim : hashable + Coordinate along which to fit the polynomials. + deg : int + Degree of the fitting polynomial. + skipna : bool, optional + If True, removes all invalid values before fitting each 1D slices of the array. + Default is True if data is stored in a dask.array or if there is any + invalid values, False otherwise. + rcond : float, optional + Relative condition number to the fit. + w : Union[Hashable, Any], optional + Weights to apply to the y-coordinate of the sample points. + Can be an array-like object or the name of a coordinate in the dataset. + full : bool, optional + Whether to return the residuals, matrix rank and singular values in addition + to the coefficients. + cov : Union[bool, str], optional + Whether to return to the covariance matrix in addition to the coefficients. + The matrix is not scaled if `cov='unscaled'`. + + Returns + ------- + polyfit_results : Dataset + A single dataset which contains: + + polyfit_coefficients + The coefficients of the best fit. + polyfit_residuals + The residuals of the least-square computation (only included if `full=True`) + [dim]_matrix_rank + The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`) + [dim]_singular_value + The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`) + polyfit_covariance + The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`) + + See also + -------- + numpy.polyfit + """ + return self._to_temp_dataset().polyfit( + dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov + ) + + def pad( + self, + pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, + mode: str = "constant", + stat_length: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + constant_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + end_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + reflect_type: str = None, + **pad_width_kwargs: Any, + ) -> "DataArray": + """Pad this array along one or more dimensions. + + .. warning:: + This function is experimental and its behaviour is likely to change + especially regarding padding of dimension coordinates (or IndexVariables). + + When using one of the modes ("edge", "reflect", "symmetric", "wrap"), + coordinates will be padded with the same mode, otherwise coordinates + are padded using the "constant" mode with fill_value dtypes.NA. + + Parameters + ---------- + pad_width : Mapping with the form of {dim: (pad_before, pad_after)} + Number of values padded along each dimension. 
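A short sketch of ``polyfit`` with ``skipna`` as documented above; the remaining points lie exactly on a line, so the fit is essentially exact:

```
import numpy as np
import xarray as xr

y = xr.DataArray([1.0, np.nan, 5.0, 7.0], dims="x", coords={"x": [0, 1, 2, 3]})

fit = y.polyfit("x", deg=1, skipna=True)  # the NaN sample is dropped before fitting
print(fit.polyfit_coefficients.values)    # approximately [2., 1.]  (slope, intercept)
```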
+ {dim: pad} is a shortcut for pad_before = pad_after = pad + mode : str + One of the following string values (taken from numpy docs) + + 'constant' (default) + Pads with a constant value. + 'edge' + Pads with the edge values of array. + 'linear_ramp' + Pads with the linear ramp between end_value and the + array edge value. + 'maximum' + Pads with the maximum value of all or part of the + vector along each axis. + 'mean' + Pads with the mean value of all or part of the + vector along each axis. + 'median' + Pads with the median value of all or part of the + vector along each axis. + 'minimum' + Pads with the minimum value of all or part of the + vector along each axis. + 'reflect' + Pads with the reflection of the vector mirrored on + the first and last values of the vector along each + axis. + 'symmetric' + Pads with the reflection of the vector mirrored + along the edge of the array. + 'wrap' + Pads with the wrap of the vector along the axis. + The first values are used to pad the end and the + end values are used to pad the beginning. + stat_length : int, tuple or mapping of the form {dim: tuple} + Used in 'maximum', 'mean', 'median', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + {dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)} unique + statistic lengths along each dimension. + ((before, after),) yields same before and after statistic lengths + for each dimension. + (stat_length,) or int is a shortcut for before = after = statistic + length for all axes. + Default is ``None``, to use the entire axis. + constant_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'constant'. The values to set the padded values for each + axis. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + pad constants along each dimension. + ``((before, after),)`` yields same before and after constants for each + dimension. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all dimensions. + Default is 0. + end_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + end values along each dimension. + ``((before, after),)`` yields same before and after end values for each + axis. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all axes. + Default is 0. + reflect_type : {'even', 'odd'}, optional + Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. For + the 'odd' style, the extended part of the array is created by + subtracting the reflected values from two times the edge value. + **pad_width_kwargs: + The keyword arguments form of ``pad_width``. + One of ``pad_width`` or ``pad_width_kwargs`` must be provided. + + Returns + ------- + padded : DataArray + DataArray with the padded coordinates and data. + + See also + -------- + DataArray.shift, DataArray.roll, DataArray.bfill, DataArray.ffill, numpy.pad, dask.array.pad + + Notes + ----- + By default when ``mode="constant"`` and ``constant_values=None``, integer types will be + promoted to ``float`` and padded with ``np.nan``. 
To avoid type promotion + specify ``constant_values=np.nan`` + + Examples + -------- + + >>> arr = xr.DataArray([5, 6, 7], coords=[("x", [0,1,2])]) + >>> arr.pad(x=(1,2), constant_values=0) + + array([0, 5, 6, 7, 0, 0]) + Coordinates: + * x (x) float64 nan 0.0 1.0 2.0 nan nan + + >>> da = xr.DataArray([[0,1,2,3], [10,11,12,13]], + dims=["x", "y"], + coords={"x": [0,1], "y": [10, 20 ,30, 40], "z": ("x", [100, 200])} + ) + >>> da.pad(x=1) + + array([[nan, nan, nan, nan], + [ 0., 1., 2., 3.], + [10., 11., 12., 13.], + [nan, nan, nan, nan]]) + Coordinates: + * x (x) float64 nan 0.0 1.0 nan + * y (y) int64 10 20 30 40 + z (x) float64 nan 100.0 200.0 nan + >>> da.pad(x=1, constant_values=np.nan) + + array([[-9223372036854775808, -9223372036854775808, -9223372036854775808, + -9223372036854775808], + [ 0, 1, 2, + 3], + [ 10, 11, 12, + 13], + [-9223372036854775808, -9223372036854775808, -9223372036854775808, + -9223372036854775808]]) + Coordinates: + * x (x) float64 nan 0.0 1.0 nan + * y (y) int64 10 20 30 40 + z (x) float64 nan 100.0 200.0 nan + """ + ds = self._to_temp_dataset().pad( + pad_width=pad_width, + mode=mode, + stat_length=stat_length, + constant_values=constant_values, + end_values=end_values, + reflect_type=reflect_type, + **pad_width_kwargs, + ) + return self._from_temp_dataset(ds) + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = property(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 11db1c7006b..c1b02aa7969 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -46,6 +46,7 @@ resample, rolling, utils, + weighted, ) from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align from .common import ( @@ -75,6 +76,7 @@ merge_coordinates_without_align, merge_data_and_coords, ) +from .missing import get_clean_interp_index from .options import OPTIONS, _get_keep_attrs from .pycompat import dask_array_type from .utils import ( @@ -86,6 +88,7 @@ decode_numpy_dict_values, either_dict_or_kwargs, hashable, + infix_dims, is_dict_like, is_scalar, maybe_wrap_array, @@ -457,6 +460,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords): _rolling_cls = rolling.DatasetRolling _coarsen_cls = rolling.DatasetCoarsen _resample_cls = resample.DatasetResample + _weighted_cls = weighted.DatasetWeighted def __init__( self, @@ -533,7 +537,7 @@ def __init__( if isinstance(coords, Dataset): coords = coords.variables - variables, coord_names, dims, indexes = merge_data_and_coords( + variables, coord_names, dims, indexes, _ = merge_data_and_coords( data_vars, coords, compat="broadcast_equals" ) @@ -1010,8 +1014,9 @@ def copy(self, deep: bool = False, data: Mapping = None) -> "Dataset": Shallow copy versus deep copy >>> da = xr.DataArray(np.random.randn(2, 3)) - >>> ds = xr.Dataset({'foo': da, 'bar': ('x', [-1, 2])}, - coords={'x': ['one', 'two']}) + >>> ds = xr.Dataset( + ... {"foo": da, "bar": ("x", [-1, 2])}, coords={"x": ["one", "two"]}, + ... 
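As the warning in the ``pad`` docstring above notes, the non-constant modes also pad the dimension coordinates with the same mode; a small sketch using ``mode="edge"``:

```
import xarray as xr

arr = xr.DataArray([5, 6, 7], coords=[("x", [0, 1, 2])])

padded = arr.pad(x=(1, 1), mode="edge")
print(padded.values)    # [5 5 6 7 7]
print(padded.x.values)  # [0 0 1 2 2] - the coordinate repeats its edge values too
```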
) >>> ds.copy() Dimensions: (dim_0: 2, dim_1: 3, x: 2) @@ -1021,8 +1026,9 @@ def copy(self, deep: bool = False, data: Mapping = None) -> "Dataset": Data variables: foo (dim_0, dim_1) float64 -0.8079 0.3897 -1.862 -0.6091 -1.051 -0.3003 bar (x) int64 -1 2 + >>> ds_0 = ds.copy(deep=False) - >>> ds_0['foo'][0, 0] = 7 + >>> ds_0["foo"][0, 0] = 7 >>> ds_0 Dimensions: (dim_0: 2, dim_1: 3, x: 2) @@ -1032,6 +1038,7 @@ def copy(self, deep: bool = False, data: Mapping = None) -> "Dataset": Data variables: foo (dim_0, dim_1) float64 7.0 0.3897 -1.862 -0.6091 -1.051 -0.3003 bar (x) int64 -1 2 + >>> ds Dimensions: (dim_0: 2, dim_1: 3, x: 2) @@ -1046,7 +1053,9 @@ def copy(self, deep: bool = False, data: Mapping = None) -> "Dataset": structure of the original object, but with the new data. Original object is unaffected. - >>> ds.copy(data={'foo': np.arange(6).reshape(2, 3), 'bar': ['a', 'b']}) + >>> ds.copy( + ... data={"foo": np.arange(6).reshape(2, 3), "bar": ["a", "b"]} + ... ) Dimensions: (dim_0: 2, dim_1: 3, x: 2) Coordinates: @@ -1055,6 +1064,7 @@ def copy(self, deep: bool = False, data: Mapping = None) -> "Dataset": Data variables: foo (dim_0, dim_1) int64 0 1 2 3 4 5 bar (x) >> ds Dimensions: (dim_0: 2, dim_1: 3, x: 2) @@ -2355,9 +2365,10 @@ def reindex( >>> x = xr.Dataset( ... { ... "temperature": ("station", 20 * np.random.rand(4)), - ... "pressure": ("station", 500 * np.random.rand(4)) + ... "pressure": ("station", 500 * np.random.rand(4)), ... }, - ... coords={"station": ["boston", "nyc", "seattle", "denver"]}) + ... coords={"station": ["boston", "nyc", "seattle", "denver"]}, + ... ) >>> x Dimensions: (station: 4) @@ -2372,8 +2383,8 @@ def reindex( Create a new index and reindex the dataset. By default values in the new index that do not have corresponding records in the dataset are assigned `NaN`. - >>> new_index = ['boston', 'austin', 'seattle', 'lincoln'] - >>> x.reindex({'station': new_index}) + >>> new_index = ["boston", "austin", "seattle", "lincoln"] + >>> x.reindex({"station": new_index}) Dimensions: (station: 4) Coordinates: @@ -2384,7 +2395,7 @@ def reindex( We can fill in the missing values by passing a value to the keyword `fill_value`. - >>> x.reindex({'station': new_index}, fill_value=0) + >>> x.reindex({"station": new_index}, fill_value=0) Dimensions: (station: 4) Coordinates: @@ -2396,7 +2407,7 @@ def reindex( Because the index is not monotonically increasing or decreasing, we cannot use arguments to the keyword method to fill the `NaN` values. - >>> x.reindex({'station': new_index}, method='nearest') + >>> x.reindex({"station": new_index}, method="nearest") Traceback (most recent call last): ... raise ValueError('index must be monotonic increasing or decreasing') @@ -2407,10 +2418,14 @@ def reindex( >>> x2 = xr.Dataset( ... { - ... "temperature": ("time", [15.57, 12.77, np.nan, 0.3081, 16.59, 15.12]), - ... "pressure": ("time", 500 * np.random.rand(6)) + ... "temperature": ( + ... "time", + ... [15.57, 12.77, np.nan, 0.3081, 16.59, 15.12], + ... ), + ... "pressure": ("time", 500 * np.random.rand(6)), ... }, - ... coords={"time": pd.date_range('01/01/2019', periods=6, freq='D')}) + ... coords={"time": pd.date_range("01/01/2019", periods=6, freq="D")}, + ... ) >>> x2 Dimensions: (time: 6) @@ -2422,8 +2437,8 @@ def reindex( Suppose we decide to expand the dataset to cover a wider date range. 
- >>> time_index2 = pd.date_range('12/29/2018', periods=10, freq='D') - >>> x2.reindex({'time': time_index2}) + >>> time_index2 = pd.date_range("12/29/2018", periods=10, freq="D") + >>> x2.reindex({"time": time_index2}) Dimensions: (time: 10) Coordinates: @@ -2438,7 +2453,7 @@ def reindex( For example, to back-propagate the last valid value to fill the `NaN` values, pass `bfill` as an argument to the `method` keyword. - >>> x3 = x2.reindex({'time': time_index2}, method='bfill') + >>> x3 = x2.reindex({"time": time_index2}, method="bfill") >>> x3 Dimensions: (time: 10) @@ -2571,6 +2586,17 @@ def interp( coords = either_dict_or_kwargs(coords, coords_kwargs, "interp") indexers = dict(self._validate_interp_indexers(coords)) + if coords: + # This avoids broadcasting over coordinates that are both in + # the original array AND in the indexing array. It essentially + # forces interpolation along the shared coordinates. + sdims = ( + set(self.dims) + .intersection(*[set(nx.dims) for nx in indexers.values()]) + .difference(coords.keys()) + ) + indexers.update({d: self.variables[d] for d in sdims}) + obj = self if assume_sorted else self.sortby([k for k in coords]) def maybe_variable(obj, k): @@ -2882,8 +2908,10 @@ def swap_dims( Examples -------- - >>> ds = xr.Dataset(data_vars={"a": ("x", [5, 7]), "b": ("x", [0.1, 2.4])}, - coords={"x": ["a", "b"], "y": ("x", [0, 1])}) + >>> ds = xr.Dataset( + ... data_vars={"a": ("x", [5, 7]), "b": ("x", [0.1, 2.4])}, + ... coords={"x": ["a", "b"], "y": ("x", [0, 1])}, + ... ) >>> ds Dimensions: (x: 2) @@ -2893,6 +2921,7 @@ def swap_dims( Data variables: a (x) int64 5 7 b (x) float64 0.1 2.4 + >>> ds.swap_dims({"x": "y"}) Dimensions: (y: 2) @@ -2902,6 +2931,7 @@ def swap_dims( Data variables: a (y) int64 5 7 b (y) float64 0.1 2.4 + >>> ds.swap_dims({"x": "z"}) Dimensions: (z: 2) @@ -3122,13 +3152,12 @@ def set_index( Examples -------- - >>> arr = xr.DataArray(data=np.ones((2, 3)), - ... dims=['x', 'y'], - ... coords={'x': - ... range(2), 'y': - ... range(3), 'a': ('x', [3, 4]) - ... }) - >>> ds = xr.Dataset({'v': arr}) + >>> arr = xr.DataArray( + ... data=np.ones((2, 3)), + ... dims=["x", "y"], + ... coords={"x": range(2), "y": range(3), "a": ("x", [3, 4])}, + ... ) + >>> ds = xr.Dataset({"v": arr}) >>> ds Dimensions: (x: 2, y: 3) @@ -3138,7 +3167,7 @@ def set_index( a (x) int64 3 4 Data variables: v (x, y) float64 1.0 1.0 1.0 1.0 1.0 1.0 - >>> ds.set_index(x='a') + >>> ds.set_index(x="a") Dimensions: (x: 2, y: 3) Coordinates: @@ -3235,6 +3264,8 @@ def reorder_levels( return self._replace(variables, indexes=indexes) def _stack_once(self, dims, new_dim): + if ... in dims: + dims = list(infix_dims(dims, self.dims)) variables = {} for name, var in self.variables.items(): if name not in dims: @@ -3277,7 +3308,9 @@ def stack( ---------- dimensions : Mapping of the form new_name=(dim1, dim2, ...) Names of new dimensions, and the existing dimensions that they - replace. + replace. An ellipsis (`...`) will be replaced by all unlisted dimensions. + Passing a list containing an ellipsis (`stacked_dim=[...]`) will stack over + all dimensions. **dimensions_kwargs: The keyword arguments form of ``dimensions``. One of dimensions or dimensions_kwargs must be provided. @@ -3341,10 +3374,12 @@ def to_stacked_array( Examples -------- - >>> data = Dataset( - ... data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]), - ... 'b': ('x', [6, 7])}, - ... coords={'y': ['u', 'v', 'w']} + >>> data = xr.Dataset( + ... data_vars={ + ... "a": (("x", "y"), [[0, 1, 2], [3, 4, 5]]), + ... 
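A sketch of the ellipsis support added to ``stack`` via ``infix_dims`` above, stacking over every dimension without naming them:

```
import numpy as np
import xarray as xr

ds = xr.Dataset({"v": (("x", "y"), np.arange(6).reshape(2, 3))})

stacked = ds.stack(z=[...])   # `...` stands in for all remaining dimensions
print(stacked["v"].dims)      # ('z',)
print(stacked.sizes["z"])     # 6
```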
"b": ("x", [6, 7]), + ... }, + ... coords={"y": ["u", "v", "w"]}, ... ) >>> data @@ -3357,7 +3392,7 @@ def to_stacked_array( a (x, y) int64 0 1 2 3 4 5 b (x) int64 6 7 - >>> data.to_stacked_array("z", sample_dims=['x']) + >>> data.to_stacked_array("z", sample_dims=["x"]) array([[0, 1, 2, 6], [3, 4, 5, 7]]) @@ -3728,9 +3763,9 @@ def drop_sel(self, labels=None, *, errors="raise", **labels_kwargs): Examples -------- >>> data = np.random.randn(2, 3) - >>> labels = ['a', 'b', 'c'] - >>> ds = xr.Dataset({'A': (['x', 'y'], data), 'y': labels}) - >>> ds.drop_sel(y=['a', 'c']) + >>> labels = ["a", "b", "c"] + >>> ds = xr.Dataset({"A": (["x", "y"], data), "y": labels}) + >>> ds.drop_sel(y=["a", "c"]) Dimensions: (x: 2, y: 1) Coordinates: @@ -3738,7 +3773,7 @@ def drop_sel(self, labels=None, *, errors="raise", **labels_kwargs): Dimensions without coordinates: x Data variables: A (x, y) float64 -0.3454 0.1734 - >>> ds.drop_sel(y='b') + >>> ds.drop_sel(y="b") Dimensions: (x: 2, y: 2) Coordinates: @@ -3943,9 +3978,10 @@ def fillna(self, value: Any) -> "Dataset": ... "A": ("x", [np.nan, 2, np.nan, 0]), ... "B": ("x", [3, 4, np.nan, 1]), ... "C": ("x", [np.nan, np.nan, np.nan, 5]), - ... "D": ("x", [np.nan, 3, np.nan, 4]) + ... "D": ("x", [np.nan, 3, np.nan, 4]), ... }, - ... coords={"x": [0, 1, 2, 3]}) + ... coords={"x": [0, 1, 2, 3]}, + ... ) >>> ds Dimensions: (x: 4) @@ -3972,7 +4008,7 @@ def fillna(self, value: Any) -> "Dataset": Replace all `NaN` elements in column ‘A’, ‘B’, ‘C’, and ‘D’, with 0, 1, 2, and 3 respectively. - >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3} + >>> values = {"A": 0, "B": 1, "C": 2, "D": 3} >>> ds.fillna(value=values) Dimensions: (x: 4) @@ -4279,7 +4315,7 @@ def map( Examples -------- >>> da = xr.DataArray(np.random.randn(2, 3)) - >>> ds = xr.Dataset({'foo': da, 'bar': ('x', [-1, 2])}) + >>> ds = xr.Dataset({"foo": da, "bar": ("x", [-1, 2])}) >>> ds Dimensions: (dim_0: 2, dim_1: 3, x: 2) @@ -4362,11 +4398,12 @@ def assign( Examples -------- - >>> import numpy as np - >>> import xarray as xr >>> x = xr.Dataset( ... { - ... "temperature_c": (("lat", "lon"), 20 * np.random.rand(4).reshape(2, 2)), + ... "temperature_c": ( + ... ("lat", "lon"), + ... 20 * np.random.rand(4).reshape(2, 2), + ... ), ... "precipitation": (("lat", "lon"), np.random.rand(4).reshape(2, 2)), ... }, ... coords={"lat": [10, 20], "lon": [150, 160]}, @@ -4383,7 +4420,7 @@ def assign( Where the value is a callable, evaluated on dataset: - >>> x.assign(temperature_f = lambda x: x.temperature_c * 9 / 5 + 32) + >>> x.assign(temperature_f=lambda x: x.temperature_c * 9 / 5 + 32) Dimensions: (lat: 2, lon: 2) Coordinates: @@ -4879,17 +4916,22 @@ def diff(self, dim, n=1, label="upper"): difference : same type as caller The n-th order finite difference of this object. + .. note:: + + `n` matches numpy's behavior and is different from pandas' first + argument named `periods`. 
+ Examples -------- - >>> ds = xr.Dataset({'foo': ('x', [5, 5, 6, 6])}) - >>> ds.diff('x') + >>> ds = xr.Dataset({"foo": ("x", [5, 5, 6, 6])}) + >>> ds.diff("x") Dimensions: (x: 3) Coordinates: * x (x) int64 1 2 3 Data variables: foo (x) int64 0 1 0 - >>> ds.diff('x', 2) + >>> ds.diff("x", 2) Dimensions: (x: 2) Coordinates: @@ -4973,7 +5015,7 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs): Examples -------- - >>> ds = xr.Dataset({'foo': ('x', list('abcde'))}) + >>> ds = xr.Dataset({"foo": ("x", list("abcde"))}) >>> ds.shift(x=2) Dimensions: (x: 5) @@ -5032,7 +5074,7 @@ def roll(self, shifts=None, roll_coords=None, **shifts_kwargs): Examples -------- - >>> ds = xr.Dataset({'foo': ('x', list('abcde'))}) + >>> ds = xr.Dataset({"foo": ("x", list("abcde"))}) >>> ds.roll(x=2) Dimensions: (x: 5) @@ -5135,7 +5177,13 @@ def sortby(self, variables, ascending=True): return aligned_self.isel(**indices) def quantile( - self, q, dim=None, interpolation="linear", numeric_only=False, keep_attrs=None + self, + q, + dim=None, + interpolation="linear", + numeric_only=False, + keep_attrs=None, + skipna=True, ): """Compute the qth quantile of the data along the specified dimension. @@ -5166,6 +5214,8 @@ def quantile( object will be returned without attributes. numeric_only : bool, optional If True, only apply ``func`` to variables with a numeric dtype. + skipna : bool, optional + Whether to skip missing values when aggregating. Returns ------- @@ -5178,7 +5228,7 @@ def quantile( See Also -------- - numpy.nanquantile, pandas.Series.quantile, DataArray.quantile + numpy.nanquantile, numpy.quantile, pandas.Series.quantile, DataArray.quantile Examples -------- @@ -5253,6 +5303,7 @@ def quantile( dim=reduce_dims, interpolation=interpolation, keep_attrs=keep_attrs, + skipna=skipna, ) else: @@ -5536,19 +5587,23 @@ def filter_by_attrs(self, **kwargs): >>> precip = 10 * np.random.rand(2, 2, 3) >>> lon = [[-99.83, -99.32], [-99.79, -99.23]] >>> lat = [[42.25, 42.21], [42.63, 42.59]] - >>> dims = ['x', 'y', 'time'] - >>> temp_attr = dict(standard_name='air_potential_temperature') - >>> precip_attr = dict(standard_name='convective_precipitation_flux') - >>> ds = xr.Dataset({ - ... 'temperature': (dims, temp, temp_attr), - ... 'precipitation': (dims, precip, precip_attr)}, - ... coords={ - ... 'lon': (['x', 'y'], lon), - ... 'lat': (['x', 'y'], lat), - ... 'time': pd.date_range('2014-09-06', periods=3), - ... 'reference_time': pd.Timestamp('2014-09-05')}) + >>> dims = ["x", "y", "time"] + >>> temp_attr = dict(standard_name="air_potential_temperature") + >>> precip_attr = dict(standard_name="convective_precipitation_flux") + >>> ds = xr.Dataset( + ... { + ... "temperature": (dims, temp, temp_attr), + ... "precipitation": (dims, precip, precip_attr), + ... }, + ... coords={ + ... "lon": (["x", "y"], lon), + ... "lat": (["x", "y"], lat), + ... "time": pd.date_range("2014-09-06", periods=3), + ... "reference_time": pd.Timestamp("2014-09-05"), + ... }, + ... ) >>> # Get variables matching a specific standard_name. 
- >>> ds.filter_by_attrs(standard_name='convective_precipitation_flux') + >>> ds.filter_by_attrs(standard_name="convective_precipitation_flux") Dimensions: (time: 3, x: 2, y: 2) Coordinates: @@ -5698,5 +5753,349 @@ def map_blocks( return map_blocks(func, self, args, kwargs, template) + def polyfit( + self, + dim: Hashable, + deg: int, + skipna: bool = None, + rcond: float = None, + w: Union[Hashable, Any] = None, + full: bool = False, + cov: Union[bool, str] = False, + ): + """ + Least squares polynomial fit. + + This replicates the behaviour of `numpy.polyfit` but differs by skipping + invalid values when `skipna = True`. + + Parameters + ---------- + dim : hashable + Coordinate along which to fit the polynomials. + deg : int + Degree of the fitting polynomial. + skipna : bool, optional + If True, removes all invalid values before fitting each 1D slices of the array. + Default is True if data is stored in a dask.array or if there is any + invalid values, False otherwise. + rcond : float, optional + Relative condition number to the fit. + w : Union[Hashable, Any], optional + Weights to apply to the y-coordinate of the sample points. + Can be an array-like object or the name of a coordinate in the dataset. + full : bool, optional + Whether to return the residuals, matrix rank and singular values in addition + to the coefficients. + cov : Union[bool, str], optional + Whether to return to the covariance matrix in addition to the coefficients. + The matrix is not scaled if `cov='unscaled'`. + + + Returns + ------- + polyfit_results : Dataset + A single dataset which contains (for each "var" in the input dataset): + + [var]_polyfit_coefficients + The coefficients of the best fit for each variable in this dataset. + [var]_polyfit_residuals + The residuals of the least-square computation for each variable (only included if `full=True`) + [dim]_matrix_rank + The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`) + [dim]_singular_values + The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`) + [var]_polyfit_covariance + The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`) + + See also + -------- + numpy.polyfit + """ + variables = {} + skipna_da = skipna + + x = get_clean_interp_index(self, dim) + xname = "{}_".format(self[dim].name) + order = int(deg) + 1 + lhs = np.vander(x, order) + + if rcond is None: + rcond = x.shape[0] * np.core.finfo(x.dtype).eps + + # Weights: + if w is not None: + if isinstance(w, Hashable): + w = self.coords[w] + w = np.asarray(w) + if w.ndim != 1: + raise TypeError("Expected a 1-d array for weights.") + if w.shape[0] != lhs.shape[0]: + raise TypeError("Expected w and {} to have the same length".format(dim)) + lhs *= w[:, np.newaxis] + + # Scaling + scale = np.sqrt((lhs * lhs).sum(axis=0)) + lhs /= scale + + degree_dim = utils.get_temp_dimname(self.dims, "degree") + + rank = np.linalg.matrix_rank(lhs) + if rank != order and not full: + warnings.warn( + "Polyfit may be poorly conditioned", np.RankWarning, stacklevel=4 + ) + + if full: + rank = xr.DataArray(rank, name=xname + "matrix_rank") + variables[rank.name] = rank + sing = np.linalg.svd(lhs, compute_uv=False) + sing = xr.DataArray( + sing, + dims=(degree_dim,), + coords={degree_dim: np.arange(order)[::-1]}, + name=xname + "singular_values", + ) + variables[sing.name] = sing + + for name, da in self.data_vars.items(): + if dim not in da.dims: + continue + + if skipna is None: + if 
isinstance(da.data, dask_array_type): + skipna_da = True + else: + skipna_da = np.any(da.isnull()) + + dims_to_stack = [dimname for dimname in da.dims if dimname != dim] + stacked_coords = {} + if dims_to_stack: + stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") + rhs = da.transpose(dim, *dims_to_stack).stack( + {stacked_dim: dims_to_stack} + ) + stacked_coords = {stacked_dim: rhs[stacked_dim]} + scale_da = scale[:, np.newaxis] + else: + rhs = da + scale_da = scale + + if w is not None: + rhs *= w[:, np.newaxis] + + coeffs, residuals = duck_array_ops.least_squares( + lhs, rhs.data, rcond=rcond, skipna=skipna_da + ) + + if isinstance(name, str): + name = "{}_".format(name) + else: + # Thus a ReprObject => polyfit was called on a DataArray + name = "" + + coeffs = xr.DataArray( + coeffs / scale_da, + dims=[degree_dim] + list(stacked_coords.keys()), + coords={degree_dim: np.arange(order)[::-1], **stacked_coords}, + name=name + "polyfit_coefficients", + ) + if dims_to_stack: + coeffs = coeffs.unstack(stacked_dim) + variables[coeffs.name] = coeffs + + if full or (cov is True): + residuals = xr.DataArray( + residuals if dims_to_stack else residuals.squeeze(), + dims=list(stacked_coords.keys()), + coords=stacked_coords, + name=name + "polyfit_residuals", + ) + if dims_to_stack: + residuals = residuals.unstack(stacked_dim) + variables[residuals.name] = residuals + + if cov: + Vbase = np.linalg.inv(np.dot(lhs.T, lhs)) + Vbase /= np.outer(scale, scale) + if cov == "unscaled": + fac = 1 + else: + if x.shape[0] <= order: + raise ValueError( + "The number of data points must exceed order to scale the covariance matrix." + ) + fac = residuals / (x.shape[0] - order) + covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j")) * fac + variables[name + "polyfit_covariance"] = covariance + + return Dataset(data_vars=variables, attrs=self.attrs.copy()) + + def pad( + self, + pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, + mode: str = "constant", + stat_length: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + constant_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + end_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + reflect_type: str = None, + **pad_width_kwargs: Any, + ) -> "Dataset": + """Pad this dataset along one or more dimensions. + + .. warning:: + This function is experimental and its behaviour is likely to change + especially regarding padding of dimension coordinates (or IndexVariables). + + When using one of the modes ("edge", "reflect", "symmetric", "wrap"), + coordinates will be padded with the same mode, otherwise coordinates + are padded using the "constant" mode with fill_value dtypes.NA. + + Parameters + ---------- + pad_width : Mapping with the form of {dim: (pad_before, pad_after)} + Number of values padded along each dimension. + {dim: pad} is a shortcut for pad_before = pad_after = pad + mode : str + One of the following string values (taken from numpy docs). + + 'constant' (default) + Pads with a constant value. + 'edge' + Pads with the edge values of array. + 'linear_ramp' + Pads with the linear ramp between end_value and the + array edge value. + 'maximum' + Pads with the maximum value of all or part of the + vector along each axis. + 'mean' + Pads with the mean value of all or part of the + vector along each axis. + 'median' + Pads with the median value of all or part of the + vector along each axis. 
+ 'minimum' + Pads with the minimum value of all or part of the + vector along each axis. + 'reflect' + Pads with the reflection of the vector mirrored on + the first and last values of the vector along each + axis. + 'symmetric' + Pads with the reflection of the vector mirrored + along the edge of the array. + 'wrap' + Pads with the wrap of the vector along the axis. + The first values are used to pad the end and the + end values are used to pad the beginning. + stat_length : int, tuple or mapping of the form {dim: tuple} + Used in 'maximum', 'mean', 'median', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + {dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)} unique + statistic lengths along each dimension. + ((before, after),) yields same before and after statistic lengths + for each dimension. + (stat_length,) or int is a shortcut for before = after = statistic + length for all axes. + Default is ``None``, to use the entire axis. + constant_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'constant'. The values to set the padded values for each + axis. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + pad constants along each dimension. + ``((before, after),)`` yields same before and after constants for each + dimension. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all dimensions. + Default is 0. + end_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + end values along each dimension. + ``((before, after),)`` yields same before and after end values for each + axis. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all axes. + Default is 0. + reflect_type : {'even', 'odd'}, optional + Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. For + the 'odd' style, the extended part of the array is created by + subtracting the reflected values from two times the edge value. + **pad_width_kwargs: + The keyword arguments form of ``pad_width``. + One of ``pad_width`` or ``pad_width_kwargs`` must be provided. + + Returns + ------- + padded : Dataset + Dataset with the padded coordinates and data. + + See also + -------- + Dataset.shift, Dataset.roll, Dataset.bfill, Dataset.ffill, numpy.pad, dask.array.pad + + Notes + ----- + By default when ``mode="constant"`` and ``constant_values=None``, integer types will be + promoted to ``float`` and padded with ``np.nan``. 
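Both of the new Dataset methods introduced above, `polyfit` and `pad`, are easiest to grasp from a short usage sketch. The snippet below is illustrative only: the variable names and data are invented, and the exact output names follow the naming scheme described in the `polyfit` docstring (e.g. `signal_polyfit_coefficients` along a `degree` dimension).

```python
import numpy as np
import xarray as xr

x = np.arange(10)
ds = xr.Dataset(
    {"signal": ("x", 3.0 * x + 1.0 + 0.1 * np.random.randn(10))},
    coords={"x": x},
)

# Least-squares fit along "x"; the result holds one set of coefficients per
# input variable, indexed by "degree".
fit = ds.polyfit(dim="x", deg=1)

# Pad along "x": one value before, two after. With the default
# mode="constant" the new positions are filled with NaN.
padded = ds.pad(x=(1, 2))
```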
To avoid type promotion + specify ``constant_values=np.nan`` + + Examples + -------- + + >>> ds = xr.Dataset({'foo': ('x', range(5))}) + >>> ds.pad(x=(1,2)) + + Dimensions: (x: 8) + Dimensions without coordinates: x + Data variables: + foo (x) float64 nan 0.0 1.0 2.0 3.0 4.0 nan nan + """ + pad_width = either_dict_or_kwargs(pad_width, pad_width_kwargs, "pad") + + if mode in ("edge", "reflect", "symmetric", "wrap"): + coord_pad_mode = mode + coord_pad_options = { + "stat_length": stat_length, + "constant_values": constant_values, + "end_values": end_values, + "reflect_type": reflect_type, + } + else: + coord_pad_mode = "constant" + coord_pad_options = {} + + variables = {} + for name, var in self.variables.items(): + var_pad_width = {k: v for k, v in pad_width.items() if k in var.dims} + if not var_pad_width: + variables[name] = var + elif name in self.data_vars: + variables[name] = var.pad( + pad_width=var_pad_width, + mode=mode, + stat_length=stat_length, + constant_values=constant_values, + end_values=end_values, + reflect_type=reflect_type, + ) + else: + variables[name] = var.pad( + pad_width=var_pad_width, + mode=coord_pad_mode, + **coord_pad_options, # type: ignore + ) + + return self._replace_vars_and_dims(variables) + ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 6d0abe9a6fc..4047a1e68e1 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -114,7 +114,7 @@ def notnull(data): isin = _dask_or_eager_func("isin", array_args=slice(2)) take = _dask_or_eager_func("take") broadcast_to = _dask_or_eager_func("broadcast_to") -pad = _dask_or_eager_func("pad") +pad = _dask_or_eager_func("pad", dask_module=dask_array_compat) _concatenate = _dask_or_eager_func("concatenate", list_of_args=True) _stack = _dask_or_eager_func("stack", list_of_args=True) @@ -597,3 +597,12 @@ def rolling_window(array, axis, window, center, fill_value): return dask_array_ops.rolling_window(array, axis, window, center, fill_value) else: # np.ndarray return nputils.rolling_window(array, axis, window, center, fill_value) + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + """Return the coefficients and residuals of a least-squares fit. + """ + if isinstance(rhs, dask_array_type): + return dask_array_ops.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) + else: + return nputils.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) diff --git a/xarray/core/extensions.py b/xarray/core/extensions.py index 79abbccea39..e81070d18fd 100644 --- a/xarray/core/extensions.py +++ b/xarray/core/extensions.py @@ -110,8 +110,9 @@ def plot(self): Back in an interactive IPython session: - >>> ds = xarray.Dataset({'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}) + >>> ds = xarray.Dataset( + ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)} + ... 
) >>> ds.geo.center (5.0, 10.0) >>> ds.geo.plot() diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 89246ff228d..534d253ecc8 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -4,6 +4,7 @@ import functools from datetime import datetime, timedelta from itertools import zip_longest +from typing import Hashable import numpy as np import pandas as pd @@ -14,7 +15,7 @@ from .pycompat import dask_array_type, sparse_array_type -def pretty_print(x, numchars): +def pretty_print(x, numchars: int): """Given an object `x`, call `str(x)` and format the returned string so that it is numchars long, padding with trailing spaces or truncating with ellipses as necessary @@ -163,7 +164,7 @@ def format_items(x): return formatted -def format_array_flat(array, max_width): +def format_array_flat(array, max_width: int): """Return a formatted string for as many items in the flattened version of array that will fit within max_width characters. """ @@ -198,11 +199,20 @@ def format_array_flat(array, max_width): num_back = count - num_front # note that num_back is 0 <--> array.size is 0 or 1 # <--> relevant_back_items is [] - pprint_str = ( - " ".join(relevant_front_items[:num_front]) - + padding - + " ".join(relevant_back_items[-num_back:]) + pprint_str = "".join( + [ + " ".join(relevant_front_items[:num_front]), + padding, + " ".join(relevant_back_items[-num_back:]), + ] ) + + # As a final check, if it's still too long even with the limit in values, + # replace the end with an ellipsis + # NB: this will still returns a full 3-character ellipsis when max_width < 3 + if len(pprint_str) > max_width: + pprint_str = pprint_str[: max(max_width - 3, 0)] + "..." + return pprint_str @@ -258,10 +268,16 @@ def inline_variable_array_repr(var, max_width): return "..." 
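The final width guard added to `format_array_flat` above is easiest to see on a plain string. The helper below is a simplified stand-in for that guard (not the xarray function itself), showing the truncation and the always-3-character ellipsis mentioned in the NB comment.

```python
def truncate_preview(pprint_str: str, max_width: int) -> str:
    # If the assembled preview is still too long, chop it and append an
    # ellipsis; the ellipsis stays 3 characters even when max_width < 3.
    if len(pprint_str) > max_width:
        pprint_str = pprint_str[: max(max_width - 3, 0)] + "..."
    return pprint_str


print(truncate_preview("0.123 4.567 8.901", max_width=10))  # "0.123 4..."
print(truncate_preview("0.123 4.567 8.901", max_width=2))   # "..."
```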
-def summarize_variable(name, var, col_width, marker=" ", max_width=None): +def summarize_variable( + name: Hashable, var, col_width: int, marker: str = " ", max_width: int = None +): """Summarize a variable in one line, e.g., for the Dataset.__repr__.""" if max_width is None: - max_width = OPTIONS["display_width"] + max_width_options = OPTIONS["display_width"] + if not isinstance(max_width_options, int): + raise TypeError(f"`max_width` value of `{max_width}` is not a valid int") + else: + max_width = max_width_options first_col = pretty_print(f" {marker} {name} ", col_width) if var.dims: dims_str = "({}) ".format(", ".join(map(str, var.dims))) @@ -295,7 +311,7 @@ def summarize_datavar(name, var, col_width): return summarize_variable(name, var.variable, col_width) -def summarize_coord(name, var, col_width): +def summarize_coord(name: Hashable, var, col_width: int): is_index = name in var.dims marker = "*" if is_index else " " if is_index: diff --git a/xarray/core/formatting_html.py b/xarray/core/formatting_html.py index 8ceda8bfbfa..8678a58b381 100644 --- a/xarray/core/formatting_html.py +++ b/xarray/core/formatting_html.py @@ -95,7 +95,7 @@ def summarize_variable(name, var, is_index=False, dtype=None, preview=None): cssclass_idx = " class='xr-has-index'" if is_index else "" dims_str = f"({', '.join(escape(dim) for dim in var.dims)})" - name = escape(name) + name = escape(str(name)) dtype = dtype or escape(str(var.dtype)) # "unique" ids required to expand/collapse subsections diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index f2a9ebac6eb..67e8f0588b3 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -558,7 +558,9 @@ def fillna(self, value): out = ops.fillna(self, value) return out - def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): + def quantile( + self, q, dim=None, interpolation="linear", keep_attrs=None, skipna=True + ): """Compute the qth quantile over each array in the groups and concatenate them together into a new array. @@ -582,6 +584,8 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): * higher: ``j``. * nearest: ``i`` or ``j``, whichever is nearest. * midpoint: ``(i + j) / 2``. + skipna : bool, optional + Whether to skip missing values when aggregating. 
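A brief sketch of what the new `skipna` flag changes for grouped quantiles, assuming a build that threads the parameter through as in this diff; the labels and values are invented.

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"a": ("x", [0.0, np.nan, 2.0, 3.0])},
    coords={"label": ("x", ["u", "u", "v", "v"])},
)

# Default: missing values are skipped (nanquantile-style), so group "u"
# reduces to 0.0 and group "v" to 2.5.
ds.groupby("label").quantile(0.5)

# skipna=False propagates NaN for any group containing a missing value,
# so group "u" becomes NaN.
ds.groupby("label").quantile(0.5, skipna=False)
```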
Returns ------- @@ -595,7 +599,7 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): See Also -------- - numpy.nanquantile, pandas.Series.quantile, Dataset.quantile, + numpy.nanquantile, numpy.quantile, pandas.Series.quantile, Dataset.quantile, DataArray.quantile Examples @@ -656,6 +660,7 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): dim=dim, interpolation=interpolation, keep_attrs=keep_attrs, + skipna=skipna, ) return out @@ -715,7 +720,7 @@ def assign_coords(self, coords=None, **coords_kwargs): def _maybe_reorder(xarray_obj, dim, positions): order = _inverse_permutation_indices(positions) - if order is None: + if order is None or len(order) != xarray_obj.sizes[dim]: return xarray_obj else: return xarray_obj[{dim: order}] @@ -833,7 +838,8 @@ def _combine(self, applied, restore_coord_dims=False, shortcut=False): if isinstance(combined, type(self._obj)): # only restore dimension order for arrays combined = self._restore_dim_order(combined) - if coord is not None: + # assign coord when the applied function does not return that coord + if coord is not None and dim not in applied_example.dims: if shortcut: coord_var = as_variable(coord) combined._coords[coord.name] = coord_var @@ -949,7 +955,8 @@ def _combine(self, applied): coord, dim, positions = self._infer_concat_args(applied_example) combined = concat(applied, dim) combined = _maybe_reorder(combined, dim, positions) - if coord is not None: + # assign coord when the applied function does not return that coord + if coord is not None and dim not in applied_example.dims: combined[coord.name] = coord combined = self._maybe_restore_empty_groups(combined) combined = self._maybe_unstack(combined) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 06bf08cefd2..dea1767d50c 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -22,6 +22,8 @@ def remove_unused_levels_categories(index): for i, level in enumerate(index.levels): if isinstance(level, pd.CategoricalIndex): level = level[index.codes[i]].remove_unused_categories() + else: + level = level[index.codes[i]] levels.append(level) index = pd.MultiIndex.from_arrays(levels, names=index.names) elif isinstance(index, pd.CategoricalIndex): diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 10c7804d718..fea94246471 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -20,7 +20,7 @@ from . import dtypes, pdcompat from .alignment import deep_align from .duck_array_ops import lazy_array_equiv -from .utils import Frozen, dict_equiv +from .utils import Frozen, compat_dict_union, dict_equiv from .variable import Variable, as_variable, assert_unique_multiindex_level_names if TYPE_CHECKING: @@ -491,17 +491,54 @@ def assert_valid_explicit_coords(variables, dims, explicit_coords): ) +def merge_attrs(variable_attrs, combine_attrs): + """Combine attributes from different variables according to combine_attrs + """ + if not variable_attrs: + # no attributes to merge + return None + + if combine_attrs == "drop": + return {} + elif combine_attrs == "override": + return variable_attrs[0] + elif combine_attrs == "no_conflicts": + result = dict(variable_attrs[0]) + for attrs in variable_attrs[1:]: + try: + result = compat_dict_union(result, attrs) + except ValueError: + raise MergeError( + "combine_attrs='no_conflicts', but some values are not " + "the same. 
Merging %s with %s" % (str(result), str(attrs)) + ) + return result + elif combine_attrs == "identical": + result = dict(variable_attrs[0]) + for attrs in variable_attrs[1:]: + if not dict_equiv(result, attrs): + raise MergeError( + "combine_attrs='identical', but attrs differ. First is %s " + ", other is %s." % (str(result), str(attrs)) + ) + return result + else: + raise ValueError("Unrecognised value for combine_attrs=%s" % combine_attrs) + + class _MergeResult(NamedTuple): variables: Dict[Hashable, Variable] coord_names: Set[Hashable] dims: Dict[Hashable, int] indexes: Dict[Hashable, pd.Index] + attrs: Dict[Hashable, Any] def merge_core( objects: Iterable["CoercibleMapping"], compat: str = "broadcast_equals", join: str = "outer", + combine_attrs: Optional[str] = "override", priority_arg: Optional[int] = None, explicit_coords: Optional[Sequence] = None, indexes: Optional[Mapping[Hashable, pd.Index]] = None, @@ -519,6 +556,8 @@ def merge_core( Compatibility checks to use when merging variables. join : {'outer', 'inner', 'left', 'right'}, optional How to combine objects with different indexes. + combine_attrs : {'drop', 'identical', 'no_conflicts', 'override'}, optional + How to combine attributes of objects priority_arg : integer, optional Optional argument in `objects` that takes precedence over the others. explicit_coords : set, optional @@ -536,12 +575,15 @@ def merge_core( Set of coordinate names. dims : dict Dictionary mapping from dimension names to sizes. + attrs : dict + Dictionary of attributes Raises ------ MergeError if the merge cannot be done successfully. """ - from .dataset import calculate_dimensions + from .dataarray import DataArray + from .dataset import Dataset, calculate_dimensions _assert_compat_valid(compat) @@ -571,7 +613,16 @@ def merge_core( "coordinates or not in the merged result: %s" % ambiguous_coords ) - return _MergeResult(variables, coord_names, dims, out_indexes) + attrs = merge_attrs( + [ + var.attrs + for var in coerced + if isinstance(var, Dataset) or isinstance(var, DataArray) + ], + combine_attrs, + ) + + return _MergeResult(variables, coord_names, dims, out_indexes, attrs) def merge( @@ -579,6 +630,7 @@ def merge( compat: str = "no_conflicts", join: str = "outer", fill_value: object = dtypes.NA, + combine_attrs: str = "drop", ) -> "Dataset": """Merge any number of xarray objects into a single Dataset as variables. @@ -614,6 +666,16 @@ def merge( dimension must have the same size in all objects. fill_value : scalar, optional Value to use for newly missing values + combine_attrs : {'drop', 'identical', 'no_conflicts', 'override'}, + default 'drop' + String indicating how to combine attrs of the objects being merged: + + - 'drop': empty attrs on returned Dataset. + - 'identical': all attrs must be the same on every object. + - 'no_conflicts': attrs from all objects are combined, any that have + the same name must also have the same value. + - 'override': skip comparing and copy attrs from the first dataset to + the result. 
Returns ------- @@ -678,7 +740,7 @@ def merge( var2 (lat, lon) float64 5.0 nan 6.0 nan nan nan 7.0 nan 8.0 var3 (time, lon) float64 0.0 nan 3.0 4.0 nan 9.0 - >>> xr.merge([x, y, z], compat='identical') + >>> xr.merge([x, y, z], compat="identical") Dimensions: (lat: 3, lon: 3, time: 2) Coordinates: @@ -690,7 +752,7 @@ def merge( var2 (lat, lon) float64 5.0 nan 6.0 nan nan nan 7.0 nan 8.0 var3 (time, lon) float64 0.0 nan 3.0 4.0 nan 9.0 - >>> xr.merge([x, y, z], compat='equals') + >>> xr.merge([x, y, z], compat="equals") Dimensions: (lat: 3, lon: 3, time: 2) Coordinates: @@ -702,7 +764,7 @@ def merge( var2 (lat, lon) float64 5.0 nan 6.0 nan nan nan 7.0 nan 8.0 var3 (time, lon) float64 0.0 nan 3.0 4.0 nan 9.0 - >>> xr.merge([x, y, z], compat='equals', fill_value=-999.) + >>> xr.merge([x, y, z], compat="equals", fill_value=-999.0) Dimensions: (lat: 3, lon: 3, time: 2) Coordinates: @@ -714,7 +776,7 @@ def merge( var2 (lat, lon) float64 5.0 -999.0 6.0 -999.0 ... -999.0 7.0 -999.0 8.0 var3 (time, lon) float64 0.0 -999.0 3.0 4.0 -999.0 9.0 - >>> xr.merge([x, y, z], join='override') + >>> xr.merge([x, y, z], join="override") Dimensions: (lat: 2, lon: 2, time: 2) Coordinates: @@ -726,7 +788,7 @@ def merge( var2 (lat, lon) float64 5.0 6.0 7.0 8.0 var3 (time, lon) float64 0.0 3.0 4.0 9.0 - >>> xr.merge([x, y, z], join='inner') + >>> xr.merge([x, y, z], join="inner") Dimensions: (lat: 1, lon: 1, time: 2) Coordinates: @@ -738,7 +800,7 @@ def merge( var2 (lat, lon) float64 5.0 var3 (time, lon) float64 0.0 4.0 - >>> xr.merge([x, y, z], compat='identical', join='inner') + >>> xr.merge([x, y, z], compat="identical", join="inner") Dimensions: (lat: 1, lon: 1, time: 2) Coordinates: @@ -750,7 +812,7 @@ def merge( var2 (lat, lon) float64 5.0 var3 (time, lon) float64 0.0 4.0 - >>> xr.merge([x, y, z], compat='broadcast_equals', join='outer') + >>> xr.merge([x, y, z], compat="broadcast_equals", join="outer") Dimensions: (lat: 3, lon: 3, time: 2) Coordinates: @@ -762,7 +824,7 @@ def merge( var2 (lat, lon) float64 5.0 nan 6.0 nan nan nan 7.0 nan 8.0 var3 (time, lon) float64 0.0 nan 3.0 4.0 nan 9.0 - >>> xr.merge([x, y, z], join='exact') + >>> xr.merge([x, y, z], join="exact") Traceback (most recent call last): ... ValueError: indexes along dimension 'lat' are not equal @@ -787,10 +849,16 @@ def merge( "Dataset(s), DataArray(s), and dictionaries." 
) - obj = obj.to_dataset() if isinstance(obj, DataArray) else obj + obj = obj.to_dataset(promote_attrs=True) if isinstance(obj, DataArray) else obj dict_like_objects.append(obj) - merge_result = merge_core(dict_like_objects, compat, join, fill_value=fill_value) + merge_result = merge_core( + dict_like_objects, + compat, + join, + combine_attrs=combine_attrs, + fill_value=fill_value, + ) merged = Dataset._construct_direct(**merge_result._asdict()) return merged @@ -861,4 +929,9 @@ def dataset_update_method( if coord_names: other[key] = value.drop_vars(coord_names) - return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes) + return merge_core( + [dataset, other], + priority_arg=1, + indexes=dataset.indexes, + combine_attrs="override", + ) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index cf189e471cc..fa6df63e0ea 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -165,7 +165,7 @@ def _rolling_window(a, window, axis=-1): Examples -------- - >>> x=np.arange(10).reshape((2,5)) + >>> x = np.arange(10).reshape((2, 5)) >>> np.rolling_window(x, 3, axis=-1) array([[[0, 1, 2], [1, 2, 3], [2, 3, 4]], [[5, 6, 7], [6, 7, 8], [7, 8, 9]]]) @@ -220,6 +220,39 @@ def f(values, axis=None, **kwargs): return f +def _nanpolyfit_1d(arr, x, rcond=None): + out = np.full((x.shape[1] + 1,), np.nan) + mask = np.isnan(arr) + if not np.all(mask): + out[:-1], out[-1], _, _ = np.linalg.lstsq(x[~mask, :], arr[~mask], rcond=rcond) + return out + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + if skipna: + added_dim = rhs.ndim == 1 + if added_dim: + rhs = rhs.reshape(rhs.shape[0], 1) + nan_cols = np.any(np.isnan(rhs), axis=0) + out = np.empty((lhs.shape[1] + 1, rhs.shape[1])) + if np.any(nan_cols): + out[:, nan_cols] = np.apply_along_axis( + _nanpolyfit_1d, 0, rhs[:, nan_cols], lhs + ) + if np.any(~nan_cols): + out[:-1, ~nan_cols], out[-1, ~nan_cols], _, _ = np.linalg.lstsq( + lhs, rhs[:, ~nan_cols], rcond=rcond + ) + coeffs = out[:-1, :] + residuals = out[-1, :] + if added_dim: + coeffs = coeffs.reshape(coeffs.shape[0]) + residuals = residuals.reshape(residuals.shape[0]) + else: + coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs, rcond=rcond) + return coeffs, residuals + + nanmin = _create_bottleneck_method("nanmin") nanmax = _create_bottleneck_method("nanmax") nanmean = _create_bottleneck_method("nanmean") diff --git a/xarray/core/options.py b/xarray/core/options.py index 72f9ad8e1fa..5d81ca40a6e 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -20,7 +20,7 @@ CMAP_SEQUENTIAL: "viridis", CMAP_DIVERGENT: "RdBu_r", KEEP_ATTRS: "default", - DISPLAY_STYLE: "text", + DISPLAY_STYLE: "html", } _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"]) @@ -108,7 +108,7 @@ class set_options: You can use ``set_options`` either as a context manager: - >>> ds = xr.Dataset({'x': np.arange(1000)}) + >>> ds = xr.Dataset({"x": np.arange(1000)}) >>> with xr.set_options(display_width=40): ... print(ds) diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py index 29ac84c6df5..957ae569d87 100644 --- a/xarray/core/parallel.py +++ b/xarray/core/parallel.py @@ -192,18 +192,19 @@ def map_blocks( ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``, its indices, and its methods like ``.groupby()``. - >>> def calculate_anomaly(da, groupby_type='time.month'): + >>> def calculate_anomaly(da, groupby_type="time.month"): ... # Necessary workaround to xarray's check with zero dimensions ... 
# https://github.com/pydata/xarray/issues/3575 ... if sum(da.shape) == 0: ... return da ... gb = da.groupby(groupby_type) - ... clim = gb.mean(dim='time') + ... clim = gb.mean(dim="time") ... return gb - clim - >>> time = xr.cftime_range('1990-01', '1992-01', freq='M') + >>> time = xr.cftime_range("1990-01", "1992-01", freq="M") >>> np.random.seed(123) - >>> array = xr.DataArray(np.random.rand(len(time)), - ... dims="time", coords=[time]).chunk() + >>> array = xr.DataArray( + ... np.random.rand(len(time)), dims="time", coords=[time] + ... ).chunk() >>> xr.map_blocks(calculate_anomaly, array).compute() array([ 0.12894847, 0.11323072, -0.0855964 , -0.09334032, 0.26848862, @@ -217,7 +218,9 @@ def map_blocks( Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments to the function being applied in ``xr.map_blocks()``: - >>> xr.map_blocks(calculate_anomaly, array, kwargs={'groupby_type': 'time.year'}) + >>> xr.map_blocks( + ... calculate_anomaly, array, kwargs={"groupby_type": "time.year"}, + ... ) array([ 0.15361741, -0.25671244, -0.31600032, 0.008463 , 0.1766172 , -0.11974531, 0.43791243, 0.14197797, -0.06191987, -0.15073425, @@ -458,6 +461,9 @@ def _wrapper(func, obj, to_array, args, kwargs, expected): var_chunks.append(output_chunks[dim]) elif dim in indexes: var_chunks.append((len(indexes[dim]),)) + elif dim in template.dims: + # new unindexed dimension + var_chunks.append((template.sizes[dim],)) data = dask.array.Array( hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index ea6d72b2e03..ecba5307680 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -7,6 +7,7 @@ from . import dtypes, duck_array_ops, utils from .dask_array_ops import dask_rolling_wrapper from .ops import inject_reduce_methods +from .options import _get_keep_attrs from .pycompat import dask_array_type try: @@ -42,10 +43,10 @@ class Rolling: DataArray.rolling """ - __slots__ = ("obj", "window", "min_periods", "center", "dim") - _attributes = ("window", "min_periods", "center", "dim") + __slots__ = ("obj", "window", "min_periods", "center", "dim", "keep_attrs") + _attributes = ("window", "min_periods", "center", "dim", "keep_attrs") - def __init__(self, obj, windows, min_periods=None, center=False): + def __init__(self, obj, windows, min_periods=None, center=False, keep_attrs=None): """ Moving window object. @@ -65,6 +66,10 @@ def __init__(self, obj, windows, min_periods=None, center=False): setting min_periods equal to the size of the window. center : boolean, default False Set the labels at the center of the window. + keep_attrs : bool, optional + If True, the object's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. Returns ------- @@ -89,6 +94,10 @@ def __init__(self, obj, windows, min_periods=None, center=False): self.center = center self.dim = dim + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + self.keep_attrs = keep_attrs + @property def _min_periods(self): return self.min_periods if self.min_periods is not None else self.window @@ -143,7 +152,7 @@ def count(self): class DataArrayRolling(Rolling): __slots__ = ("window_labels",) - def __init__(self, obj, windows, min_periods=None, center=False): + def __init__(self, obj, windows, min_periods=None, center=False, keep_attrs=None): """ Moving window object for DataArray. 
You should use DataArray.rolling() method to construct this object @@ -165,6 +174,10 @@ def __init__(self, obj, windows, min_periods=None, center=False): setting min_periods equal to the size of the window. center : boolean, default False Set the labels at the center of the window. + keep_attrs : bool, optional + If True, the object's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. Returns ------- @@ -177,7 +190,11 @@ def __init__(self, obj, windows, min_periods=None, center=False): Dataset.rolling Dataset.groupby """ - super().__init__(obj, windows, min_periods=min_periods, center=center) + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + super().__init__( + obj, windows, min_periods=min_periods, center=center, keep_attrs=keep_attrs + ) self.window_labels = self.obj[self.dim] @@ -214,21 +231,22 @@ def construct(self, window_dim, stride=1, fill_value=dtypes.NA): Examples -------- - >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b')) - >>> + >>> da = xr.DataArray(np.arange(8).reshape(2, 4), dims=("a", "b")) + >>> rolling = da.rolling(b=3) - >>> rolling.construct('window_dim') + >>> rolling.construct("window_dim") array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) Dimensions without coordinates: a, b, window_dim - >>> + >>> rolling = da.rolling(b=3, center=True) - >>> rolling.construct('window_dim') + >>> rolling.construct("window_dim") array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]], [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) Dimensions without coordinates: a, b, window_dim + """ from .dataarray import DataArray @@ -261,26 +279,26 @@ def reduce(self, func, **kwargs): Examples -------- - >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b')) - >>> + >>> da = xr.DataArray(np.arange(8).reshape(2, 4), dims=("a", "b")) >>> rolling = da.rolling(b=3) - >>> rolling.construct('window_dim') + >>> rolling.construct("window_dim") array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) Dimensions without coordinates: a, b, window_dim - >>> + >>> rolling.reduce(np.sum) array([[nan, nan, 3., 6.], [nan, nan, 15., 18.]]) Dimensions without coordinates: a, b - >>> + >>> rolling = da.rolling(b=3, min_periods=1) >>> rolling.reduce(np.nansum) array([[ 0., 1., 3., 6.], [ 4., 9., 15., 18.]]) + """ rolling_dim = utils.get_temp_dimname(self.obj.dims, "_rolling_dim") windows = self.construct(rolling_dim) @@ -331,7 +349,7 @@ def _bottleneck_reduce(self, func, **kwargs): else: shift = (-self.window // 2) + 1 valid = (slice(None),) * axis + (slice(-shift, None),) - padded = padded.pad_with_fill_value({self.dim: (0, -shift)}) + padded = padded.pad({self.dim: (0, -shift)}, mode="constant") if isinstance(padded.data, dask_array_type): raise AssertionError("should not be reachable") @@ -374,7 +392,7 @@ def _numpy_or_bottleneck_reduce( class DatasetRolling(Rolling): __slots__ = ("rollings",) - def __init__(self, obj, windows, min_periods=None, center=False): + def __init__(self, obj, windows, min_periods=None, center=False, keep_attrs=None): """ Moving window object for Dataset. You should use Dataset.rolling() method to construct this object @@ -396,6 +414,10 @@ def __init__(self, obj, windows, min_periods=None, center=False): setting min_periods equal to the size of the window. 
center : boolean, default False Set the labels at the center of the window. + keep_attrs : bool, optional + If True, the object's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. Returns ------- @@ -408,7 +430,7 @@ def __init__(self, obj, windows, min_periods=None, center=False): Dataset.groupby DataArray.groupby """ - super().__init__(obj, windows, min_periods, center) + super().__init__(obj, windows, min_periods, center, keep_attrs) if self.dim not in self.obj.dims: raise KeyError(self.dim) # Keep each Rolling object as a dictionary @@ -416,7 +438,9 @@ def __init__(self, obj, windows, min_periods=None, center=False): for key, da in self.obj.data_vars.items(): # keeps rollings only for the dataset depending on slf.dim if self.dim in da.dims: - self.rollings[key] = DataArrayRolling(da, windows, min_periods, center) + self.rollings[key] = DataArrayRolling( + da, windows, min_periods, center, keep_attrs + ) def _dataset_implementation(self, func, **kwargs): from .dataset import Dataset @@ -427,7 +451,8 @@ def _dataset_implementation(self, func, **kwargs): reduced[key] = func(self.rollings[key], **kwargs) else: reduced[key] = self.obj[key] - return Dataset(reduced, coords=self.obj.coords) + attrs = self.obj.attrs if self.keep_attrs else {} + return Dataset(reduced, coords=self.obj.coords, attrs=attrs) def reduce(self, func, **kwargs): """Reduce the items in this group by applying `func` along some @@ -466,7 +491,7 @@ def _numpy_or_bottleneck_reduce( **kwargs, ) - def construct(self, window_dim, stride=1, fill_value=dtypes.NA): + def construct(self, window_dim, stride=1, fill_value=dtypes.NA, keep_attrs=None): """ Convert this rolling object to xr.Dataset, where the window dimension is stacked as a new dimension @@ -487,6 +512,9 @@ def construct(self, window_dim, stride=1, fill_value=dtypes.NA): from .dataset import Dataset + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=True) + dataset = {} for key, da in self.obj.data_vars.items(): if self.dim in da.dims: @@ -509,10 +537,18 @@ class Coarsen: DataArray.coarsen """ - __slots__ = ("obj", "boundary", "coord_func", "windows", "side", "trim_excess") + __slots__ = ( + "obj", + "boundary", + "coord_func", + "windows", + "side", + "trim_excess", + "keep_attrs", + ) _attributes = ("windows", "side", "trim_excess") - def __init__(self, obj, windows, boundary, side, coord_func): + def __init__(self, obj, windows, boundary, side, coord_func, keep_attrs): """ Moving window object. 
@@ -541,6 +577,7 @@ def __init__(self, obj, windows, boundary, side, coord_func): self.windows = windows self.side = side self.boundary = boundary + self.keep_attrs = keep_attrs absent_dims = [dim for dim in windows.keys() if dim not in self.obj.dims] if absent_dims: @@ -626,6 +663,11 @@ def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool def wrapped_func(self, **kwargs): from .dataset import Dataset + if self.keep_attrs: + attrs = self.obj.attrs + else: + attrs = {} + reduced = {} for key, da in self.obj.data_vars.items(): reduced[key] = da.variable.coarsen( @@ -644,7 +686,7 @@ def wrapped_func(self, **kwargs): ) else: coords[c] = v.variable - return Dataset(reduced, coords=coords) + return Dataset(reduced, coords=coords, attrs=attrs) return wrapped_func diff --git a/xarray/core/rolling_exp.py b/xarray/core/rolling_exp.py index ac6768e8a9c..6ef63e42291 100644 --- a/xarray/core/rolling_exp.py +++ b/xarray/core/rolling_exp.py @@ -94,8 +94,8 @@ def mean(self): Examples -------- - >>> da = xr.DataArray([1,1,2,2,2], dims='x') - >>> da.rolling_exp(x=2, window_type='span').mean() + >>> da = xr.DataArray([1, 1, 2, 2, 2], dims="x") + >>> da.rolling_exp(x=2, window_type="span").mean() array([1. , 1. , 1.692308, 1.9 , 1.966942]) Dimensions without coordinates: x diff --git a/xarray/core/utils.py b/xarray/core/utils.py index e335365d5ca..896ee31ab5c 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -184,7 +184,7 @@ def peek_at(iterable: Iterable[T]) -> Tuple[T, Iterator[T]]: def update_safety_check( - first_dict: MutableMapping[K, V], + first_dict: Mapping[K, V], second_dict: Mapping[K, V], compat: Callable[[V, V], bool] = equivalent, ) -> None: @@ -343,7 +343,7 @@ def dict_equiv( return True -def ordered_dict_intersection( +def compat_dict_intersection( first_dict: Mapping[K, V], second_dict: Mapping[K, V], compat: Callable[[V, V], bool] = equivalent, @@ -371,6 +371,35 @@ def ordered_dict_intersection( return new_dict +def compat_dict_union( + first_dict: Mapping[K, V], + second_dict: Mapping[K, V], + compat: Callable[[V, V], bool] = equivalent, +) -> MutableMapping[K, V]: + """Return the union of two dictionaries as a new dictionary. + + An exception is raised if any keys are found in both dictionaries and the + values are not compatible. + + Parameters + ---------- + first_dict, second_dict : dict-like + Mappings to merge. + compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equivalence. + + Returns + ------- + union : dict + union of the contents. + """ + new_dict = dict(first_dict) + update_safety_check(first_dict, second_dict, compat) + new_dict.update(second_dict) + return new_dict + + class Frozen(Mapping[K, V]): """Wrapper around an object implementing the mapping interface to make it immutable. 
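The "no_conflicts" attribute policy is built on this union-with-safety-check. The helper below is a simplified stand-in for the new `compat_dict_union` (invented name, equality instead of the `compat` callable), showing the behaviour merge relies on.

```python
def compat_union(first: dict, second: dict) -> dict:
    # Union of two attribute dicts; any key present in both must map to the
    # same value, otherwise the union is rejected.
    for key in first.keys() & second.keys():
        if first[key] != second[key]:
            raise ValueError(f"conflicting value for key {key!r}")
    return {**first, **second}


compat_union({"units": "K", "source": "model"}, {"source": "model", "title": "t2m"})
# -> {'units': 'K', 'source': 'model', 'title': 't2m'}

# A conflicting value raises, which merge() surfaces as a MergeError when
# combine_attrs="no_conflicts":
# compat_union({"units": "K"}, {"units": "degC"})  # ValueError
```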
If you really want to modify the mapping, the mutable version is diff --git a/xarray/core/variable.py b/xarray/core/variable.py index daa8678157b..c9addeefb04 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,11 +1,12 @@ import copy import functools import itertools +import numbers import warnings from collections import defaultdict from datetime import timedelta from distutils.version import LooseVersion -from typing import Any, Dict, Hashable, Mapping, TypeVar, Union +from typing import Any, Dict, Hashable, Mapping, Tuple, TypeVar, Union import numpy as np import pandas as pd @@ -32,12 +33,6 @@ infix_dims, ) -try: - import dask.array as da -except ImportError: - pass - - NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( indexing.ExplicitlyIndexed, pd.Index, @@ -843,7 +838,7 @@ def copy(self, deep=True, data=None): Shallow copy versus deep copy - >>> var = xr.Variable(data=[1, 2, 3], dims='x') + >>> var = xr.Variable(data=[1, 2, 3], dims="x") >>> var.copy() array([1, 2, 3]) @@ -1150,66 +1145,114 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs): result = result._shift_one_dim(dim, count, fill_value=fill_value) return result - def pad_with_fill_value( - self, pad_widths=None, fill_value=dtypes.NA, **pad_widths_kwargs + def _pad_options_dim_to_index( + self, + pad_option: Mapping[Hashable, Union[int, Tuple[int, int]]], + fill_with_shape=False, + ): + if fill_with_shape: + return [ + (n, n) if d not in pad_option else pad_option[d] + for d, n in zip(self.dims, self.data.shape) + ] + return [(0, 0) if d not in pad_option else pad_option[d] for d in self.dims] + + def pad( + self, + pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, + mode: str = "constant", + stat_length: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + constant_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + end_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + reflect_type: str = None, + **pad_width_kwargs: Any, ): """ - Return a new Variable with paddings. + Return a new Variable with padded data. Parameters ---------- - pad_width: Mapping of the form {dim: (before, after)} - Number of values padded to the edges of each dimension. - **pad_widths_kwargs: - Keyword argument for pad_widths + pad_width: Mapping with the form of {dim: (pad_before, pad_after)} + Number of values padded along each dimension. + {dim: pad} is a shortcut for pad_before = pad_after = pad + mode: (str) + See numpy / Dask docs + stat_length : int, tuple or mapping of the form {dim: tuple} + Used in 'maximum', 'mean', 'median', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + constant_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'constant'. The values to set the padded values for each + axis. + end_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + reflect_type : {'even', 'odd'}, optional + Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. For + the 'odd' style, the extended part of the array is created by + subtracting the reflected values from two times the edge value. + **pad_width_kwargs: + One of pad_width or pad_width_kwargs must be provided. 
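The core of this `Variable.pad` refactor is converting dimension-keyed options into the positional `(before, after)` tuples that `numpy.pad` expects. Below is a simplified stand-in for that conversion (invented name `dim_options_to_index`), not the xarray helper itself.

```python
import numpy as np

def dim_options_to_index(dims, shape, pad_option, fill_with_shape=False):
    # For each dimension, look up its (before, after) entry; unmentioned
    # dimensions fall back to (0, 0), or to (n, n) when the option is a
    # statistic length that should default to the whole axis.
    return [
        pad_option.get(d, (n, n) if fill_with_shape else (0, 0))
        for d, n in zip(dims, shape)
    ]


data = np.arange(6).reshape(2, 3)
pad_width = dim_options_to_index(("x", "y"), data.shape, {"y": (1, 2)})
# -> [(0, 0), (1, 2)], the positional form understood by np.pad
np.pad(data, pad_width, mode="constant", constant_values=-1)
```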
+ + Returns + ------- + padded : Variable + Variable with the same dimensions and attributes but padded data. """ - pad_widths = either_dict_or_kwargs(pad_widths, pad_widths_kwargs, "pad") + pad_width = either_dict_or_kwargs(pad_width, pad_width_kwargs, "pad") - if fill_value is dtypes.NA: - dtype, fill_value = dtypes.maybe_promote(self.dtype) + # change default behaviour of pad with mode constant + if mode == "constant" and ( + constant_values is None or constant_values is dtypes.NA + ): + dtype, constant_values = dtypes.maybe_promote(self.dtype) else: dtype = self.dtype - if isinstance(self.data, dask_array_type): - array = self.data - - # Dask does not yet support pad. We manually implement it. - # https://github.com/dask/dask/issues/1926 - for d, pad in pad_widths.items(): - axis = self.get_axis_num(d) - before_shape = list(array.shape) - before_shape[axis] = pad[0] - before_chunks = list(array.chunks) - before_chunks[axis] = (pad[0],) - after_shape = list(array.shape) - after_shape[axis] = pad[1] - after_chunks = list(array.chunks) - after_chunks[axis] = (pad[1],) - - arrays = [] - if pad[0] > 0: - arrays.append( - da.full( - before_shape, fill_value, dtype=dtype, chunks=before_chunks - ) - ) - arrays.append(array) - if pad[1] > 0: - arrays.append( - da.full( - after_shape, fill_value, dtype=dtype, chunks=after_chunks - ) - ) - if len(arrays) > 1: - array = da.concatenate(arrays, axis=axis) - else: - pads = [(0, 0) if d not in pad_widths else pad_widths[d] for d in self.dims] - array = np.pad( - self.data.astype(dtype, copy=False), - pads, - mode="constant", - constant_values=fill_value, + # create pad_options_kwargs, numpy requires only relevant kwargs to be nonempty + if isinstance(stat_length, dict): + stat_length = self._pad_options_dim_to_index( + stat_length, fill_with_shape=True ) + if isinstance(constant_values, dict): + constant_values = self._pad_options_dim_to_index(constant_values) + if isinstance(end_values, dict): + end_values = self._pad_options_dim_to_index(end_values) + + # workaround for bug in Dask's default value of stat_length https://github.com/dask/dask/issues/5303 + if stat_length is None and mode in ["maximum", "mean", "median", "minimum"]: + stat_length = [(n, n) for n in self.data.shape] # type: ignore + + # change integer values to a tuple of two of those values and change pad_width to index + for k, v in pad_width.items(): + if isinstance(v, numbers.Number): + pad_width[k] = (v, v) + pad_width_by_index = self._pad_options_dim_to_index(pad_width) + + # create pad_options_kwargs, numpy/dask requires only relevant kwargs to be nonempty + pad_option_kwargs = {} + if stat_length is not None: + pad_option_kwargs["stat_length"] = stat_length + if constant_values is not None: + pad_option_kwargs["constant_values"] = constant_values + if end_values is not None: + pad_option_kwargs["end_values"] = end_values + if reflect_type is not None: + pad_option_kwargs["reflect_type"] = reflect_type # type: ignore + + array = duck_array_ops.pad( + self.data.astype(dtype, copy=False), + pad_width_by_index, + mode=mode, + **pad_option_kwargs, + ) + return type(self)(self.dims, array) def _roll_one_dim(self, dim, count): @@ -1678,7 +1721,9 @@ def no_conflicts(self, other, equiv=duck_array_ops.array_notnull_equiv): """ return self.broadcast_equals(other, equiv=equiv) - def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): + def quantile( + self, q, dim=None, interpolation="linear", keep_attrs=None, skipna=True + ): """Compute the qth quantile of the data along 
the specified dimension. Returns the qth quantiles(s) of the array elements. @@ -1725,6 +1770,8 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): from .computation import apply_ufunc + _quantile_func = np.nanquantile if skipna else np.quantile + if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) @@ -1739,7 +1786,7 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): def _wrapper(npa, **kwargs): # move quantile axis to end. required for apply_ufunc - return np.moveaxis(np.nanquantile(npa, **kwargs), 0, -1) + return np.moveaxis(_quantile_func(npa, **kwargs), 0, -1) axis = np.arange(-1, -1 * len(dim) - 1, -1) result = apply_ufunc( @@ -1840,13 +1887,13 @@ def rolling_window( Examples -------- - >>> v=Variable(('a', 'b'), np.arange(8).reshape((2,4))) - >>> v.rolling_window(x, 'b', 3, 'window_dim') + >>> v = Variable(("a", "b"), np.arange(8).reshape((2, 4))) + >>> v.rolling_window(x, "b", 3, "window_dim") array([[[nan, nan, 0], [nan, 0, 1], [0, 1, 2], [1, 2, 3]], [[nan, nan, 4], [nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) - >>> v.rolling_window(x, 'b', 3, 'window_dim', center=True) + >>> v.rolling_window(x, "b", 3, "window_dim", center=True) array([[[nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, nan]], [[nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, nan]]]) @@ -1926,10 +1973,10 @@ def _coarsen_reshape(self, windows, boundary, side): if pad < 0: pad += window if side[d] == "left": - pad_widths = {d: (0, pad)} + pad_width = {d: (0, pad)} else: - pad_widths = {d: (pad, 0)} - variable = variable.pad_with_fill_value(pad_widths) + pad_width = {d: (pad, 0)} + variable = variable.pad(pad_width, mode="constant") else: raise TypeError( "{} is invalid for boundary. Valid option is 'exact', " @@ -1949,6 +1996,9 @@ def _coarsen_reshape(self, windows, boundary, side): else: shape.append(variable.shape[i]) + keep_attrs = _get_keep_attrs(default=False) + variable.attrs = variable._attrs if keep_attrs else {} + return variable.data.reshape(shape), tuple(axes) @property @@ -2054,9 +2104,17 @@ def load(self): # https://github.com/python/mypy/issues/1465 @Variable.data.setter # type: ignore def data(self, data): - Variable.data.fset(self, data) - if not isinstance(self._data, PandasIndexAdapter): - self._data = PandasIndexAdapter(self._data) + raise ValueError( + f"Cannot assign to the .data attribute of dimension coordinate a.k.a IndexVariable {self.name!r}. " + f"Please use DataArray.assign_coords, Dataset.assign_coords or Dataset.assign as appropriate." + ) + + @Variable.values.setter # type: ignore + def values(self, values): + raise ValueError( + f"Cannot assign to the .values attribute of dimension coordinate a.k.a IndexVariable {self.name!r}. " + f"Please use DataArray.assign_coords, Dataset.assign_coords or Dataset.assign as appropriate." + ) def chunk(self, chunks=None, name=None, lock=False): # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk() diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py new file mode 100644 index 00000000000..996d2e4c43e --- /dev/null +++ b/xarray/core/weighted.py @@ -0,0 +1,255 @@ +from typing import TYPE_CHECKING, Hashable, Iterable, Optional, Union, overload + +from .computation import dot +from .options import _get_keep_attrs + +if TYPE_CHECKING: + from .dataarray import DataArray, Dataset + +_WEIGHTED_REDUCE_DOCSTRING_TEMPLATE = """ + Reduce this {cls}'s data by a weighted ``{fcn}`` along some dimension(s). 
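A minimal usage sketch of the weighted reductions this module provides, assuming the `DataArray.weighted` entry point referenced in the class docstring below; the numbers are invented.

```python
import numpy as np
import xarray as xr

da = xr.DataArray([1.0, np.nan, 3.0], dims="x")
weights = xr.DataArray([0.5, 1.0, 2.0], dims="x")  # weights must not contain NaN

w = da.weighted(weights)

w.sum()             # 0.5*1 + 2*3 = 6.5 (the NaN entry is zero-filled)
w.sum_of_weights()  # 0.5 + 2.0 = 2.5 (the weight of the NaN entry is excluded)
w.mean()            # 6.5 / 2.5 = 2.6
```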
+ + Parameters + ---------- + dim : str or sequence of str, optional + Dimension(s) over which to apply the weighted ``{fcn}``. + skipna : bool, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or skipna=True has not been + implemented (object, datetime64 or timedelta64). + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. + + Returns + ------- + reduced : {cls} + New {cls} object with weighted ``{fcn}`` applied to its data and + the indicated dimension(s) removed. + + Notes + ----- + Returns {on_zero} if the ``weights`` sum to 0.0 along the reduced + dimension(s). + """ + +_SUM_OF_WEIGHTS_DOCSTRING = """ + Calculate the sum of weights, accounting for missing values in the data + + Parameters + ---------- + dim : str or sequence of str, optional + Dimension(s) over which to sum the weights. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. + + Returns + ------- + reduced : {cls} + New {cls} object with the sum of the weights over the given dimension. + """ + + +class Weighted: + """An object that implements weighted operations. + + You should create a Weighted object by using the ``DataArray.weighted`` or + ``Dataset.weighted`` methods. + + See Also + -------- + Dataset.weighted + DataArray.weighted + """ + + __slots__ = ("obj", "weights") + + @overload + def __init__(self, obj: "DataArray", weights: "DataArray") -> None: + ... + + @overload # noqa: F811 + def __init__(self, obj: "Dataset", weights: "DataArray") -> None: # noqa: F811 + ... + + def __init__(self, obj, weights): # noqa: F811 + """ + Create a Weighted object + + Parameters + ---------- + obj : DataArray or Dataset + Object over which the weighted reduction operation is applied. + weights : DataArray + An array of weights associated with the values in the obj. + Each value in the obj contributes to the reduction operation + according to its associated weight. + + Notes + ----- + ``weights`` must be a ``DataArray`` and cannot contain missing values. + Missing values can be replaced by ``weights.fillna(0)``. + """ + + from .dataarray import DataArray + + if not isinstance(weights, DataArray): + raise ValueError("`weights` must be a DataArray") + + if weights.isnull().any(): + raise ValueError( + "`weights` cannot contain missing values. " + "Missing values can be replaced by `weights.fillna(0)`." + ) + + self.obj = obj + self.weights = weights + + @staticmethod + def _reduce( + da: "DataArray", + weights: "DataArray", + dim: Optional[Union[Hashable, Iterable[Hashable]]] = None, + skipna: Optional[bool] = None, + ) -> "DataArray": + """reduce using dot; equivalent to (da * weights).sum(dim, skipna) + + for internal use only + """ + + # need to infer dims as we use `dot` + if dim is None: + dim = ... 
+
+        # need to mask invalid values in da, as `dot` does not implement skipna
+        if skipna or (skipna is None and da.dtype.kind in "cfO"):
+            da = da.fillna(0.0)
+
+        # `dot` does not broadcast arrays, so this avoids creating a large
+        # DataArray (if `weights` has additional dimensions)
+        # maybe add fasttrack (`(da * weights).sum(dims=dim, skipna=skipna)`)
+        return dot(da, weights, dims=dim)
+
+    def _sum_of_weights(
+        self, da: "DataArray", dim: Optional[Union[Hashable, Iterable[Hashable]]] = None
+    ) -> "DataArray":
+        """ Calculate the sum of weights, accounting for missing values """
+
+        # we need to mask data values that are nan; else the weights are wrong
+        mask = da.notnull()
+
+        sum_of_weights = self._reduce(mask, self.weights, dim=dim, skipna=False)
+
+        # 0-weights are not valid
+        valid_weights = sum_of_weights != 0.0
+
+        return sum_of_weights.where(valid_weights)
+
+    def _weighted_sum(
+        self,
+        da: "DataArray",
+        dim: Optional[Union[Hashable, Iterable[Hashable]]] = None,
+        skipna: Optional[bool] = None,
+    ) -> "DataArray":
+        """Reduce a DataArray by a weighted ``sum`` along some dimension(s)."""
+
+        return self._reduce(da, self.weights, dim=dim, skipna=skipna)
+
+    def _weighted_mean(
+        self,
+        da: "DataArray",
+        dim: Optional[Union[Hashable, Iterable[Hashable]]] = None,
+        skipna: Optional[bool] = None,
+    ) -> "DataArray":
+        """Reduce a DataArray by a weighted ``mean`` along some dimension(s)."""
+
+        weighted_sum = self._weighted_sum(da, dim=dim, skipna=skipna)
+
+        sum_of_weights = self._sum_of_weights(da, dim=dim)
+
+        return weighted_sum / sum_of_weights
+
+    def _implementation(self, func, dim, **kwargs):
+
+        raise NotImplementedError("Use `Dataset.weighted` or `DataArray.weighted`")
+
+    def sum_of_weights(
+        self,
+        dim: Optional[Union[Hashable, Iterable[Hashable]]] = None,
+        keep_attrs: Optional[bool] = None,
+    ) -> Union["DataArray", "Dataset"]:
+
+        return self._implementation(
+            self._sum_of_weights, dim=dim, keep_attrs=keep_attrs
+        )
+
+    def sum(
+        self,
+        dim: Optional[Union[Hashable, Iterable[Hashable]]] = None,
+        skipna: Optional[bool] = None,
+        keep_attrs: Optional[bool] = None,
+    ) -> Union["DataArray", "Dataset"]:
+
+        return self._implementation(
+            self._weighted_sum, dim=dim, skipna=skipna, keep_attrs=keep_attrs
+        )
+
+    def mean(
+        self,
+        dim: Optional[Union[Hashable, Iterable[Hashable]]] = None,
+        skipna: Optional[bool] = None,
+        keep_attrs: Optional[bool] = None,
+    ) -> Union["DataArray", "Dataset"]:
+
+        return self._implementation(
+            self._weighted_mean, dim=dim, skipna=skipna, keep_attrs=keep_attrs
+        )
+
+    def __repr__(self):
+        """provide a nice str repr of our Weighted object"""
+
+        klass = self.__class__.__name__
+        weight_dims = ", ".join(self.weights.dims)
+        return f"{klass} with weights along dimensions: {weight_dims}"
+
+
+class DataArrayWeighted(Weighted):
+    def _implementation(self, func, dim, **kwargs):
+
+        keep_attrs = kwargs.pop("keep_attrs")
+        if keep_attrs is None:
+            keep_attrs = _get_keep_attrs(default=False)
+
+        weighted = func(self.obj, dim=dim, **kwargs)
+
+        if keep_attrs:
+            weighted.attrs = self.obj.attrs
+
+        return weighted
+
+
+class DatasetWeighted(Weighted):
+    def _implementation(self, func, dim, **kwargs) -> "Dataset":
+
+        return self.obj.map(func, dim=dim, **kwargs)
+
+
+def _inject_docstring(cls, cls_name):
+
+    cls.sum_of_weights.__doc__ = _SUM_OF_WEIGHTS_DOCSTRING.format(cls=cls_name)
+
+    cls.sum.__doc__ = _WEIGHTED_REDUCE_DOCSTRING_TEMPLATE.format(
+        cls=cls_name, fcn="sum", on_zero="0"
+    )
+
+    cls.mean.__doc__ = 
_WEIGHTED_REDUCE_DOCSTRING_TEMPLATE.format( + cls=cls_name, fcn="mean", on_zero="NaN" + ) + + +_inject_docstring(DataArrayWeighted, "DataArray") +_inject_docstring(DatasetWeighted, "Dataset") diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index 98131887e28..302cac05b05 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -329,7 +329,7 @@ def line( return primitive -def step(darray, *args, where="pre", linestyle=None, ls=None, **kwargs): +def step(darray, *args, where="pre", drawstyle=None, ds=None, **kwargs): """ Step plot of DataArray index against values @@ -359,16 +359,16 @@ def step(darray, *args, where="pre", linestyle=None, ls=None, **kwargs): if where not in {"pre", "post", "mid"}: raise ValueError("'where' argument to step must be " "'pre', 'post' or 'mid'") - if ls is not None: - if linestyle is None: - linestyle = ls + if ds is not None: + if drawstyle is None: + drawstyle = ds else: - raise TypeError("ls and linestyle are mutually exclusive") - if linestyle is None: - linestyle = "" - linestyle = "steps-" + where + linestyle + raise TypeError("ds and drawstyle are mutually exclusive") + if drawstyle is None: + drawstyle = "" + drawstyle = "steps-" + where + drawstyle - return line(darray, *args, linestyle=linestyle, **kwargs) + return line(darray, *args, drawstyle=drawstyle, **kwargs) def hist( diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index cb3bef6d409..e6c15037cb8 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -465,7 +465,7 @@ def _resolve_intervals_1dplot(xval, yval, xlabel, ylabel, kwargs): """ # Is it a step plot? (see matplotlib.Axes.step) - if kwargs.get("linestyle", "").startswith("steps-"): + if kwargs.get("drawstyle", "").startswith("steps-"): # Convert intervals to double points if _valid_other_type(np.array([xval, yval]), [pd.Interval]): @@ -476,7 +476,7 @@ def _resolve_intervals_1dplot(xval, yval, xlabel, ylabel, kwargs): yval, xval = _interval_to_double_bound_points(yval, xval) # Remove steps-* to be sure that matplotlib is not confused - del kwargs["linestyle"] + del kwargs["drawstyle"] # Is it another kind of plot? else: diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index f178720a6e1..b3640722106 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -7,6 +7,7 @@ from . 
import ( assert_array_equal, assert_equal, + assert_identical, raises_regex, requires_cftime, requires_dask, @@ -79,7 +80,7 @@ def test_strftime(self): def test_not_datetime_type(self): nontime_data = self.data.copy() int_data = np.arange(len(self.data.time)).astype("int8") - nontime_data["time"].values = int_data + nontime_data = nontime_data.assign_coords(time=int_data) with raises_regex(TypeError, "dt"): nontime_data.time.dt @@ -212,7 +213,7 @@ def setup(self): def test_not_datetime_type(self): nontime_data = self.data.copy() int_data = np.arange(len(self.data.time)).astype("int8") - nontime_data["time"].values = int_data + nontime_data = nontime_data.assign_coords(time=int_data) with raises_regex(TypeError, "dt"): nontime_data.time.dt @@ -346,6 +347,7 @@ def test_field_access(data, field): @requires_cftime +@pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_cftime_strftime_access(data): """ compare cftime formatting against datetime formatting """ date_format = "%Y%m%d%H" @@ -435,3 +437,106 @@ def test_seasons(cftime_date_type): seasons = xr.DataArray(seasons) assert_array_equal(seasons.values, dates.dt.season.values) + + +@pytest.fixture +def cftime_rounding_dataarray(cftime_date_type): + return xr.DataArray( + [ + [cftime_date_type(1, 1, 1, 1), cftime_date_type(1, 1, 1, 15)], + [cftime_date_type(1, 1, 1, 23), cftime_date_type(1, 1, 2, 1)], + ] + ) + + +@requires_cftime +@requires_dask +@pytest.mark.parametrize("use_dask", [False, True]) +def test_cftime_floor_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask): + import dask.array as da + + freq = "D" + expected = xr.DataArray( + [ + [cftime_date_type(1, 1, 1, 0), cftime_date_type(1, 1, 1, 0)], + [cftime_date_type(1, 1, 1, 0), cftime_date_type(1, 1, 2, 0)], + ], + name="floor", + ) + + if use_dask: + chunks = {"dim_0": 1} + # Currently a compute is done to inspect a single value of the array + # if it is of object dtype to check if it is a cftime.datetime (if not + # we raise an error when using the dt accessor). + with raise_if_dask_computes(max_computes=1): + result = cftime_rounding_dataarray.chunk(chunks).dt.floor(freq) + expected = expected.chunk(chunks) + assert isinstance(result.data, da.Array) + assert result.chunks == expected.chunks + else: + result = cftime_rounding_dataarray.dt.floor(freq) + + assert_identical(result, expected) + + +@requires_cftime +@requires_dask +@pytest.mark.parametrize("use_dask", [False, True]) +def test_cftime_ceil_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask): + import dask.array as da + + freq = "D" + expected = xr.DataArray( + [ + [cftime_date_type(1, 1, 2, 0), cftime_date_type(1, 1, 2, 0)], + [cftime_date_type(1, 1, 2, 0), cftime_date_type(1, 1, 3, 0)], + ], + name="ceil", + ) + + if use_dask: + chunks = {"dim_0": 1} + # Currently a compute is done to inspect a single value of the array + # if it is of object dtype to check if it is a cftime.datetime (if not + # we raise an error when using the dt accessor). 
+ with raise_if_dask_computes(max_computes=1): + result = cftime_rounding_dataarray.chunk(chunks).dt.ceil(freq) + expected = expected.chunk(chunks) + assert isinstance(result.data, da.Array) + assert result.chunks == expected.chunks + else: + result = cftime_rounding_dataarray.dt.ceil(freq) + + assert_identical(result, expected) + + +@requires_cftime +@requires_dask +@pytest.mark.parametrize("use_dask", [False, True]) +def test_cftime_round_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask): + import dask.array as da + + freq = "D" + expected = xr.DataArray( + [ + [cftime_date_type(1, 1, 1, 0), cftime_date_type(1, 1, 2, 0)], + [cftime_date_type(1, 1, 2, 0), cftime_date_type(1, 1, 2, 0)], + ], + name="round", + ) + + if use_dask: + chunks = {"dim_0": 1} + # Currently a compute is done to inspect a single value of the array + # if it is of object dtype to check if it is a cftime.datetime (if not + # we raise an error when using the dt accessor). + with raise_if_dask_computes(max_computes=1): + result = cftime_rounding_dataarray.chunk(chunks).dt.round(freq) + expected = expected.chunk(chunks) + assert isinstance(result.data, da.Array) + assert result.chunks == expected.chunks + else: + result = cftime_rounding_dataarray.dt.round(freq) + + assert_identical(result, expected) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b7ba70ef6c4..82fe1b38149 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1729,39 +1729,52 @@ def test_hidden_zarr_keys(self): pass @pytest.mark.skipif(LooseVersion(dask_version) < "2.4", reason="dask GH5334") - def test_write_persistence_modes(self): + @pytest.mark.parametrize("group", [None, "group1"]) + def test_write_persistence_modes(self, group): original = create_test_data() # overwrite mode - with self.roundtrip(original, save_kwargs={"mode": "w"}) as actual: + with self.roundtrip( + original, + save_kwargs={"mode": "w", "group": group}, + open_kwargs={"group": group}, + ) as actual: assert_identical(original, actual) # don't overwrite mode - with self.roundtrip(original, save_kwargs={"mode": "w-"}) as actual: + with self.roundtrip( + original, + save_kwargs={"mode": "w-", "group": group}, + open_kwargs={"group": group}, + ) as actual: assert_identical(original, actual) # make sure overwriting works as expected with self.create_zarr_target() as store: self.save(original, store) # should overwrite with no error - self.save(original, store, mode="w") - with self.open(store) as actual: + self.save(original, store, mode="w", group=group) + with self.open(store, group=group) as actual: assert_identical(original, actual) with pytest.raises(ValueError): self.save(original, store, mode="w-") # check append mode for normal write - with self.roundtrip(original, save_kwargs={"mode": "a"}) as actual: + with self.roundtrip( + original, + save_kwargs={"mode": "a", "group": group}, + open_kwargs={"group": group}, + ) as actual: assert_identical(original, actual) - ds, ds_to_append, _ = create_append_test_data() - # check append mode for append write + ds, ds_to_append, _ = create_append_test_data() with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w") - ds_to_append.to_zarr(store_target, append_dim="time") + ds.to_zarr(store_target, mode="w", group=group) + ds_to_append.to_zarr(store_target, append_dim="time", group=group) original = xr.concat([ds, ds_to_append], dim="time") - assert_identical(original, xr.open_zarr(store_target)) + actual = xr.open_zarr(store_target, 
group=group) + assert_identical(original, actual) def test_compressor_encoding(self): original = create_test_data() @@ -1908,33 +1921,36 @@ def test_to_zarr_append_compute_false_roundtrip(self): ds, ds_to_append, _ = create_append_test_data() ds, ds_to_append = ds.chunk(), ds_to_append.chunk() - with self.create_zarr_target() as store: - delayed_obj = self.save(ds, store, compute=False, mode="w") - assert isinstance(delayed_obj, Delayed) + with pytest.warns(SerializationWarning): + with self.create_zarr_target() as store: + delayed_obj = self.save(ds, store, compute=False, mode="w") + assert isinstance(delayed_obj, Delayed) + + with pytest.raises(AssertionError): + with self.open(store) as actual: + assert_identical(ds, actual) + + delayed_obj.compute() - with pytest.raises(AssertionError): with self.open(store) as actual: assert_identical(ds, actual) - delayed_obj.compute() + delayed_obj = self.save( + ds_to_append, store, compute=False, append_dim="time" + ) + assert isinstance(delayed_obj, Delayed) - with self.open(store) as actual: - assert_identical(ds, actual) + with pytest.raises(AssertionError): + with self.open(store) as actual: + assert_identical( + xr.concat([ds, ds_to_append], dim="time"), actual + ) - delayed_obj = self.save( - ds_to_append, store, compute=False, append_dim="time" - ) - assert isinstance(delayed_obj, Delayed) + delayed_obj.compute() - with pytest.raises(AssertionError): with self.open(store) as actual: assert_identical(xr.concat([ds, ds_to_append], dim="time"), actual) - delayed_obj.compute() - - with self.open(store) as actual: - assert_identical(xr.concat([ds, ds_to_append], dim="time"), actual) - def test_encoding_chunksizes(self): # regression test for GH2278 # see also test_encoding_chunksizes_unlimited @@ -1966,24 +1982,8 @@ def create_zarr_target(self): yield tmp -class ScipyWriteBase(CFEncodedBase, NetCDF3Only): - def test_append_write(self): - import scipy - - if scipy.__version__ == "1.0.1": - pytest.xfail("https://github.com/scipy/scipy/issues/8625") - super().test_append_write() - - def test_append_overwrite_values(self): - import scipy - - if scipy.__version__ == "1.0.1": - pytest.xfail("https://github.com/scipy/scipy/issues/8625") - super().test_append_overwrite_values() - - @requires_scipy -class TestScipyInMemoryData(ScipyWriteBase): +class TestScipyInMemoryData(CFEncodedBase, NetCDF3Only): engine = "scipy" @contextlib.contextmanager @@ -2004,7 +2004,7 @@ def test_bytes_pickle(self): @requires_scipy -class TestScipyFileObject(ScipyWriteBase): +class TestScipyFileObject(CFEncodedBase, NetCDF3Only): engine = "scipy" @contextlib.contextmanager @@ -2037,7 +2037,7 @@ def test_pickle_dataarray(self): @requires_scipy -class TestScipyFilePath(ScipyWriteBase): +class TestScipyFilePath(CFEncodedBase, NetCDF3Only): engine = "scipy" @contextlib.contextmanager @@ -3304,7 +3304,7 @@ def test_session(self): @requires_scipy @requires_pynio -class TestPyNio(ScipyWriteBase): +class TestPyNio(CFEncodedBase, NetCDF3Only): def test_write_store(self): # pynio is read-only for now pass @@ -3522,6 +3522,7 @@ def test_uamiv_format_mfread(self): ["example.uamiv", "example.uamiv"], engine="pseudonetcdf", concat_dim="TSTEP", + combine="nested", backend_kwargs={"format": "uamiv"}, ) @@ -3547,6 +3548,7 @@ def test_uamiv_format_mfread(self): assert_allclose(expected, actual) camxfile.close() + @pytest.mark.xfail(reason="Flaky; see GH3711") def test_uamiv_format_write(self): fmtkw = {"format": "uamiv"} @@ -4496,3 +4498,50 @@ def test_invalid_netcdf_raises(engine): data = 
create_test_data() with raises_regex(ValueError, "unrecognized option 'invalid_netcdf'"): data.to_netcdf("foo.nc", engine=engine, invalid_netcdf=True) + + +@requires_zarr +def test_encode_zarr_attr_value(): + # array -> list + arr = np.array([1, 2, 3]) + expected = [1, 2, 3] + actual = backends.zarr.encode_zarr_attr_value(arr) + assert isinstance(actual, list) + assert actual == expected + + # scalar array -> scalar + sarr = np.array(1)[()] + expected = 1 + actual = backends.zarr.encode_zarr_attr_value(sarr) + assert isinstance(actual, int) + assert actual == expected + + # string -> string (no change) + expected = "foo" + actual = backends.zarr.encode_zarr_attr_value(expected) + assert isinstance(actual, str) + assert actual == expected + + +@requires_zarr +def test_extract_zarr_variable_encoding(): + + var = xr.Variable("x", [1, 2]) + actual = backends.zarr.extract_zarr_variable_encoding(var) + assert "chunks" in actual + assert actual["chunks"] is None + + var = xr.Variable("x", [1, 2], encoding={"chunks": (1,)}) + actual = backends.zarr.extract_zarr_variable_encoding(var) + assert actual["chunks"] == (1,) + + # does not raise on invalid + var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) + actual = backends.zarr.extract_zarr_variable_encoding(var) + + # raises on invalid + var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) + with raises_regex(ValueError, "unexpected encoding parameters"): + actual = backends.zarr.extract_zarr_variable_encoding( + var, raise_on_invalid=True + ) diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 343e059f53c..2352f9e8cdd 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -1176,6 +1176,7 @@ def test_dayofweek_after_cftime_range(freq): np.testing.assert_array_equal(result, expected) +@pytest.mark.xfail(reason="See GH3885") @pytest.mark.parametrize("freq", ["A", "M", "D"]) def test_dayofyear_after_cftime_range(freq): pytest.importorskip("cftime", minversion="1.0.2.1") diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 8025766529e..d31bf9471ea 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -450,11 +450,21 @@ def test_sel_date_scalar(da, date_type, index): assert_identical(result, expected) -@pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/3751") +@requires_cftime +def test_sel_date_distant_date(da, date_type, index): + expected = xr.DataArray(4).assign_coords(time=index[3]) + result = da.sel(time=date_type(2000, 1, 1), method="nearest") + assert_identical(result, expected) + + @requires_cftime @pytest.mark.parametrize( "sel_kwargs", - [{"method": "nearest"}, {"method": "nearest", "tolerance": timedelta(days=70)}], + [ + {"method": "nearest"}, + {"method": "nearest", "tolerance": timedelta(days=70)}, + {"method": "nearest", "tolerance": timedelta(days=1800000)}, + ], ) def test_sel_date_scalar_nearest(da, date_type, index, sel_kwargs): expected = xr.DataArray(2).assign_coords(time=index[1]) @@ -502,12 +512,7 @@ def test_sel_date_scalar_backfill(da, date_type, index, sel_kwargs): [ {"method": "pad", "tolerance": timedelta(days=20)}, {"method": "backfill", "tolerance": timedelta(days=20)}, - pytest.param( - {"method": "nearest", "tolerance": timedelta(days=20)}, - marks=pytest.mark.xfail( - reason="https://github.com/pydata/xarray/issues/3751" - ), - ), + {"method": "nearest", "tolerance": timedelta(days=20)}, ], ) def test_sel_date_scalar_tolerance_raises(da, date_type, 
sel_kwargs): @@ -515,7 +520,6 @@ def test_sel_date_scalar_tolerance_raises(da, date_type, sel_kwargs): da.sel(time=date_type(1, 5, 1), **sel_kwargs) -@pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/3751") @requires_cftime @pytest.mark.parametrize( "sel_kwargs", @@ -563,12 +567,7 @@ def test_sel_date_list_backfill(da, date_type, index, sel_kwargs): [ {"method": "pad", "tolerance": timedelta(days=20)}, {"method": "backfill", "tolerance": timedelta(days=20)}, - pytest.param( - {"method": "nearest", "tolerance": timedelta(days=20)}, - marks=pytest.mark.xfail( - reason="https://github.com/pydata/xarray/issues/3751" - ), - ), + {"method": "nearest", "tolerance": timedelta(days=20)}, ], ) def test_sel_date_list_tolerance_raises(da, date_type, sel_kwargs): @@ -603,7 +602,6 @@ def range_args(date_type): ] -@pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/3751") @requires_cftime def test_indexing_in_series_getitem(series, index, scalar_args, range_args): for arg in scalar_args: @@ -738,7 +736,7 @@ def test_timedeltaindex_add_cftimeindex(calendar): @requires_cftime -def test_cftimeindex_sub(index): +def test_cftimeindex_sub_timedelta(index): date_type = index.date_type expected_dates = [ date_type(1, 1, 2), @@ -753,6 +751,27 @@ def test_cftimeindex_sub(index): assert isinstance(result, CFTimeIndex) +@requires_cftime +@pytest.mark.parametrize( + "other", + [np.array(4 * [timedelta(days=1)]), np.array(timedelta(days=1))], + ids=["1d-array", "scalar-array"], +) +def test_cftimeindex_sub_timedelta_array(index, other): + date_type = index.date_type + expected_dates = [ + date_type(1, 1, 2), + date_type(1, 2, 2), + date_type(2, 1, 2), + date_type(2, 2, 2), + ] + expected = CFTimeIndex(expected_dates) + result = index + timedelta(days=2) + result = result - other + assert result.equals(expected) + assert isinstance(result, CFTimeIndex) + + @requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_cftimeindex_sub_cftimeindex(calendar): @@ -784,6 +803,14 @@ def test_cftime_datetime_sub_cftimeindex(calendar): assert isinstance(result, pd.TimedeltaIndex) +@requires_cftime +@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) +def test_distant_cftime_datetime_sub_cftimeindex(calendar): + a = xr.cftime_range("2000", periods=5, calendar=calendar) + with pytest.raises(ValueError, match="difference exceeds"): + a.date_type(1, 1, 1) - a + + @requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_cftimeindex_sub_timedeltaindex(calendar): @@ -795,6 +822,25 @@ def test_cftimeindex_sub_timedeltaindex(calendar): assert isinstance(result, CFTimeIndex) +@requires_cftime +@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) +def test_cftimeindex_sub_index_of_cftime_datetimes(calendar): + a = xr.cftime_range("2000", periods=5, calendar=calendar) + b = pd.Index(a.values) + expected = a - a + result = a - b + assert result.equals(expected) + assert isinstance(result, pd.TimedeltaIndex) + + +@requires_cftime +@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) +def test_cftimeindex_sub_not_implemented(calendar): + a = xr.cftime_range("2000", periods=5, calendar=calendar) + with pytest.raises(TypeError, match="unsupported operand"): + a - 1 + + @requires_cftime def test_cftimeindex_rsub(index): with pytest.raises(TypeError): @@ -904,3 +950,92 @@ def test_multiindex(): index = xr.cftime_range("2001-01-01", periods=100, calendar="360_day") mindex = pd.MultiIndex.from_arrays([index]) assert mindex.get_loc("2001-01") == slice(0, 30) + + 
+@requires_cftime +@pytest.mark.parametrize("freq", ["3663S", "33T", "2H"]) +@pytest.mark.parametrize("method", ["floor", "ceil", "round"]) +def test_rounding_methods_against_datetimeindex(freq, method): + expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777S") + expected = getattr(expected, method)(freq) + result = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777S") + result = getattr(result, method)(freq).to_datetimeindex() + assert result.equals(expected) + + +@requires_cftime +@pytest.mark.parametrize("method", ["floor", "ceil", "round"]) +def test_rounding_methods_invalid_freq(method): + index = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777S") + with pytest.raises(ValueError, match="fixed"): + getattr(index, method)("MS") + + +@pytest.fixture +def rounding_index(date_type): + return xr.CFTimeIndex( + [ + date_type(1, 1, 1, 1, 59, 59, 999512), + date_type(1, 1, 1, 3, 0, 1, 500001), + date_type(1, 1, 1, 7, 0, 6, 499999), + ] + ) + + +@requires_cftime +def test_ceil(rounding_index, date_type): + result = rounding_index.ceil("S") + expected = xr.CFTimeIndex( + [ + date_type(1, 1, 1, 2, 0, 0, 0), + date_type(1, 1, 1, 3, 0, 2, 0), + date_type(1, 1, 1, 7, 0, 7, 0), + ] + ) + assert result.equals(expected) + + +@requires_cftime +def test_floor(rounding_index, date_type): + result = rounding_index.floor("S") + expected = xr.CFTimeIndex( + [ + date_type(1, 1, 1, 1, 59, 59, 0), + date_type(1, 1, 1, 3, 0, 1, 0), + date_type(1, 1, 1, 7, 0, 6, 0), + ] + ) + assert result.equals(expected) + + +@requires_cftime +def test_round(rounding_index, date_type): + result = rounding_index.round("S") + expected = xr.CFTimeIndex( + [ + date_type(1, 1, 1, 2, 0, 0, 0), + date_type(1, 1, 1, 3, 0, 2, 0), + date_type(1, 1, 1, 7, 0, 6, 0), + ] + ) + assert result.equals(expected) + + +@requires_cftime +def test_asi8(date_type): + index = xr.CFTimeIndex([date_type(1970, 1, 1), date_type(1970, 1, 2)]) + result = index.asi8 + expected = 1000000 * 86400 * np.array([0, 1]) + np.testing.assert_array_equal(result, expected) + + +@requires_cftime +def test_asi8_distant_date(): + """Test that asi8 conversion is truly exact.""" + import cftime + + date_type = cftime.DatetimeProlepticGregorian + index = xr.CFTimeIndex([date_type(10731, 4, 22, 3, 25, 45, 123456)]) + result = index.asi8 + expected = np.array([1000000 * 86400 * 400 * 8000 + 12345 * 1000000 + 123456]) + np.testing.assert_array_equal(result, expected) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index eb2c6e1dbf7..c3f981f10d1 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -503,6 +503,49 @@ def test_auto_combine_2d(self): result = combine_nested(datasets, concat_dim=["dim1", "dim2"]) assert_equal(result, expected) + def test_auto_combine_2d_combine_attrs_kwarg(self): + ds = create_test_data + + partway1 = concat([ds(0), ds(3)], dim="dim1") + partway2 = concat([ds(1), ds(4)], dim="dim1") + partway3 = concat([ds(2), ds(5)], dim="dim1") + expected = concat([partway1, partway2, partway3], dim="dim2") + + expected_dict = {} + expected_dict["drop"] = expected.copy(deep=True) + expected_dict["drop"].attrs = {} + expected_dict["no_conflicts"] = expected.copy(deep=True) + expected_dict["no_conflicts"].attrs = { + "a": 1, + "b": 2, + "c": 3, + "d": 4, + "e": 5, + "f": 6, + } + expected_dict["override"] = expected.copy(deep=True) + expected_dict["override"].attrs = {"a": 1} + + datasets = [[ds(0), ds(1), ds(2)], [ds(3), ds(4), ds(5)]] + + datasets[0][0].attrs = {"a": 1} + 
datasets[0][1].attrs = {"a": 1, "b": 2} + datasets[0][2].attrs = {"a": 1, "c": 3} + datasets[1][0].attrs = {"a": 1, "d": 4} + datasets[1][1].attrs = {"a": 1, "e": 5} + datasets[1][2].attrs = {"a": 1, "f": 6} + + with raises_regex(ValueError, "combine_attrs='identical'"): + result = combine_nested( + datasets, concat_dim=["dim1", "dim2"], combine_attrs="identical" + ) + + for combine_attrs in expected_dict: + result = combine_nested( + datasets, concat_dim=["dim1", "dim2"], combine_attrs=combine_attrs + ) + assert_identical(result, expected_dict[combine_attrs]) + def test_combine_nested_missing_data_new_dim(self): # Your data includes "time" and "station" dimensions, and each year's # data has a different set of stations. @@ -642,6 +685,52 @@ def test_combine_coords_join_exact(self): with raises_regex(ValueError, "indexes along dimension"): combine_nested(objs, concat_dim="x", join="exact") + @pytest.mark.parametrize( + "combine_attrs, expected", + [ + ("drop", Dataset({"x": [0, 1], "y": [0, 1]}, attrs={})), + ( + "no_conflicts", + Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "b": 2}), + ), + ("override", Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1})), + ], + ) + def test_combine_coords_combine_attrs(self, combine_attrs, expected): + objs = [ + Dataset({"x": [0], "y": [0]}, attrs={"a": 1}), + Dataset({"x": [1], "y": [1]}, attrs={"a": 1, "b": 2}), + ] + actual = combine_nested( + objs, concat_dim="x", join="outer", combine_attrs=combine_attrs + ) + assert_identical(expected, actual) + + if combine_attrs == "no_conflicts": + objs[1].attrs["a"] = 2 + with raises_regex(ValueError, "combine_attrs='no_conflicts'"): + actual = combine_nested( + objs, concat_dim="x", join="outer", combine_attrs=combine_attrs + ) + + def test_combine_coords_combine_attrs_identical(self): + objs = [ + Dataset({"x": [0], "y": [0]}, attrs={"a": 1}), + Dataset({"x": [1], "y": [1]}, attrs={"a": 1}), + ] + expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1}) + actual = combine_nested( + objs, concat_dim="x", join="outer", combine_attrs="identical" + ) + assert_identical(expected, actual) + + objs[1].attrs["b"] = 2 + + with raises_regex(ValueError, "combine_attrs='identical'"): + actual = combine_nested( + objs, concat_dim="x", join="outer", combine_attrs="identical" + ) + def test_infer_order_from_coords(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 369903552ad..4eed464d2dc 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1120,3 +1120,35 @@ def test_where(): actual = xr.where(cond, 1, 0) expected = xr.DataArray([1, 0], dims="x") assert_identical(expected, actual) + + +@pytest.mark.parametrize("use_dask", [True, False]) +@pytest.mark.parametrize("use_datetime", [True, False]) +def test_polyval(use_dask, use_datetime): + if use_dask and not has_dask: + pytest.skip("requires dask") + + if use_datetime: + xcoord = xr.DataArray( + pd.date_range("2000-01-01", freq="D", periods=10), dims=("x",), name="x" + ) + x = xr.core.missing.get_clean_interp_index(xcoord, "x") + else: + xcoord = x = np.arange(10) + + da = xr.DataArray( + np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), + dims=("d", "x"), + coords={"x": xcoord, "d": [0, 1]}, + ) + coeffs = xr.DataArray( + [[2, 1, 1], [3, 2, 1]], + dims=("d", "degree"), + coords={"d": [0, 1], "degree": [2, 1, 0]}, + ) + if use_dask: + coeffs = coeffs.chunk({"d": 2}) + + da_pv = 
xr.polyval(da.x, coeffs) + + xr.testing.assert_allclose(da, da_pv.T) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index bd99181a947..e5038dd4af2 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -40,8 +40,7 @@ def test_concat_compat(): assert_equal(ds2.no_x_y, result.no_x_y.transpose()) for var in ["has_x", "no_x_y"]: - assert "y" not in result[var] - + assert "y" not in result[var].dims and "y" not in result[var].coords with raises_regex(ValueError, "coordinates in some datasets but not others"): concat([ds1, ds2], dim="q") with raises_regex(ValueError, "'q' is not present in all datasets"): @@ -250,6 +249,35 @@ def test_concat_join_kwarg(self): actual = concat([ds1, ds2], join=join, dim="x") assert_equal(actual, expected[join]) + # regression test for #3681 + actual = concat([ds1.drop("x"), ds2.drop("x")], join="override", dim="y") + expected = Dataset( + {"a": (("x", "y"), np.array([0, 0], ndmin=2))}, coords={"y": [0, 0.0001]} + ) + assert_identical(actual, expected) + + def test_concat_combine_attrs_kwarg(self): + ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs={"b": 42}) + ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs={"b": 42, "c": 43}) + + expected = {} + expected["drop"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]}) + expected["no_conflicts"] = Dataset( + {"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42, "c": 43} + ) + expected["override"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42}) + + with raises_regex(ValueError, "combine_attrs='identical'"): + actual = concat([ds1, ds2], dim="x", combine_attrs="identical") + with raises_regex(ValueError, "combine_attrs='no_conflicts'"): + ds3 = ds2.copy(deep=True) + ds3.attrs["b"] = 44 + actual = concat([ds1, ds3], dim="x", combine_attrs="no_conflicts") + + for combine_attrs in expected: + actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs) + assert_identical(actual, expected[combine_attrs]) + def test_concat_promote_shape(self): # mixed dims within variables objs = [Dataset({}, {"x": 0}), Dataset({"x": [1]})] @@ -463,6 +491,30 @@ def test_concat_join_kwarg(self): actual = concat([ds1, ds2], join=join, dim="x") assert_equal(actual, expected[join].to_array()) + def test_concat_combine_attrs_kwarg(self): + da1 = DataArray([0], coords=[("x", [0])], attrs={"b": 42}) + da2 = DataArray([0], coords=[("x", [1])], attrs={"b": 42, "c": 43}) + + expected = {} + expected["drop"] = DataArray([0, 0], coords=[("x", [0, 1])]) + expected["no_conflicts"] = DataArray( + [0, 0], coords=[("x", [0, 1])], attrs={"b": 42, "c": 43} + ) + expected["override"] = DataArray( + [0, 0], coords=[("x", [0, 1])], attrs={"b": 42} + ) + + with raises_regex(ValueError, "combine_attrs='identical'"): + actual = concat([da1, da2], dim="x", combine_attrs="identical") + with raises_regex(ValueError, "combine_attrs='no_conflicts'"): + da3 = da2.copy(deep=True) + da3.attrs["b"] = 44 + actual = concat([da1, da3], dim="x", combine_attrs="no_conflicts") + + for combine_attrs in expected: + actual = concat([da1, da2], dim="x", combine_attrs=combine_attrs) + assert_identical(actual, expected[combine_attrs]) + @pytest.mark.parametrize("attr1", ({"a": {"meta": [10, 20, 30]}}, {"a": [1, 2, 3]}, {})) @pytest.mark.parametrize("attr2", ({"a": [1, 2, 3]}, {})) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 5bcc5d37f44..10230fc8dee 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1147,6 +1147,7 @@ def test_map_blocks_to_array(map_ds): lambda x: 
x.to_dataset(), lambda x: x.drop_vars("x"), lambda x: x.expand_dims(k=[1, 2, 3]), + lambda x: x.expand_dims(k=3), lambda x: x.assign_coords(new_coord=("y", x.y * 2)), lambda x: x.astype(np.int32), lambda x: x.x, @@ -1167,6 +1168,7 @@ def test_map_blocks_da_transformations(func, map_da): lambda x: x.drop_vars("a"), lambda x: x.drop_vars("x"), lambda x: x.expand_dims(k=[1, 2, 3]), + lambda x: x.expand_dims(k=3), lambda x: x.rename({"a": "new1", "b": "new2"}), lambda x: x.x, ], @@ -1304,7 +1306,7 @@ def test_token_changes_when_data_changes(obj): assert t3 != t2 # Change IndexVariable - obj.coords["x"] *= 2 + obj = obj.assign_coords(x=obj.x * 2) with raise_if_dask_computes(): t4 = dask.base.tokenize(obj) assert t4 != t3 @@ -1374,6 +1376,7 @@ def test_normalize_token_with_backend(map_ds): map_ds.to_netcdf(tmp_file) read = xr.open_dataset(tmp_file) assert not dask.base.tokenize(map_ds) == dask.base.tokenize(read) + read.close() @pytest.mark.parametrize( diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 0a622d279ba..e23ff2f7e31 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -23,6 +23,7 @@ assert_array_equal, assert_equal, assert_identical, + has_dask, raises_regex, requires_bottleneck, requires_dask, @@ -1412,6 +1413,12 @@ def test_coords_non_string(self): expected = DataArray(2, coords={1: 2}, name=1) assert_identical(actual, expected) + def test_coords_delitem_delete_indexes(self): + # regression test for GH3746 + arr = DataArray(np.ones((2,)), dims="x", coords={"x": [0, 1]}) + del arr.coords["x"] + assert "x" not in arr.indexes + def test_broadcast_like(self): arr1 = DataArray( np.ones((2, 3)), @@ -2035,11 +2042,14 @@ def test_stack_unstack(self): codes=[[], []], names=["x", "y"], ) - pd.util.testing.assert_index_equal(a, b) + pd.testing.assert_index_equal(a, b) actual = orig.stack(z=["x", "y"]).unstack("z").drop_vars(["x", "y"]) assert_identical(orig, actual) + actual = orig.stack(z=[...]).unstack("z").drop_vars(["x", "y"]) + assert_identical(orig, actual) + dims = ["a", "b", "c", "d", "e"] orig = xr.DataArray(np.random.rand(1, 2, 3, 2, 1), dims=dims) stacked = orig.stack(ab=["a", "b"], cd=["c", "d"]) @@ -2215,6 +2225,12 @@ def test_where(self): actual = arr.where(arr.x < 2, drop=True) assert_identical(actual, expected) + def test_where_lambda(self): + arr = DataArray(np.arange(4), dims="y") + expected = arr.sel(y=slice(2)) + actual = arr.where(lambda x: x.y < 2, drop=True) + assert_identical(actual, expected) + def test_where_string(self): array = DataArray(["a", "b"]) expected = DataArray(np.array(["a", np.nan], dtype=object)) @@ -2362,13 +2378,15 @@ def test_reduce_out(self): with pytest.raises(TypeError): orig.mean(out=np.ones(orig.shape)) + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]]) @pytest.mark.parametrize( "axis, dim", zip([None, 0, [0], [0, 1]], [None, "x", ["x"], ["x", "y"]]) ) - def test_quantile(self, q, axis, dim): - actual = DataArray(self.va).quantile(q, dim=dim, keep_attrs=True) - expected = np.nanpercentile(self.dv.values, np.array(q) * 100, axis=axis) + def test_quantile(self, q, axis, dim, skipna): + actual = DataArray(self.va).quantile(q, dim=dim, keep_attrs=True, skipna=skipna) + _percentile_func = np.nanpercentile if skipna else np.percentile + expected = _percentile_func(self.dv.values, np.array(q) * 100, axis=axis) np.testing.assert_allclose(actual.values, expected) if is_scalar(q): assert "quantile" not in actual.dims @@ -3403,14 
+3421,10 @@ def test_to_pandas(self): assert_array_equal(actual.columns, [0, 1]) # roundtrips - for shape in [(3,), (3, 4), (3, 4, 5)]: - if len(shape) > 2 and LooseVersion(pd.__version__) >= "0.25.0": - continue + for shape in [(3,), (3, 4)]: dims = list("abc")[: len(shape)] da = DataArray(np.random.randn(*shape), dims=dims) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", r"\W*Panel is deprecated") - roundtripped = DataArray(da.to_pandas()).drop_vars(dims) + roundtripped = DataArray(da.to_pandas()).drop_vars(dims) assert_identical(da, roundtripped) with raises_regex(ValueError, "cannot convert"): @@ -3484,7 +3498,7 @@ def test_from_series_sparse(self): def test_to_and_from_empty_series(self): # GH697 - expected = pd.Series([]) + expected = pd.Series([], dtype=np.float64) da = DataArray.from_series(expected) assert len(da) == 0 actual = da.to_series() @@ -3737,9 +3751,16 @@ def test_to_dataset_whole(self): expected = Dataset({"foo": ("x", [1, 2])}) assert_identical(expected, actual) - named = DataArray([1, 2], dims="x", name="foo") + named = DataArray([1, 2], dims="x", name="foo", attrs={"y": "testattr"}) actual = named.to_dataset() - expected = Dataset({"foo": ("x", [1, 2])}) + expected = Dataset({"foo": ("x", [1, 2], {"y": "testattr"})}) + assert_identical(expected, actual) + + # Test promoting attrs + actual = named.to_dataset(promote_attrs=True) + expected = Dataset( + {"foo": ("x", [1, 2], {"y": "testattr"})}, attrs={"y": "testattr"} + ) assert_identical(expected, actual) with pytest.raises(TypeError): @@ -4171,6 +4192,162 @@ def test_rank(self): y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=("z",)) assert_equal(y.rank("z", pct=True), y) + @pytest.mark.parametrize("use_dask", [True, False]) + @pytest.mark.parametrize("use_datetime", [True, False]) + def test_polyfit(self, use_dask, use_datetime): + if use_dask and not has_dask: + pytest.skip("requires dask") + xcoord = xr.DataArray( + pd.date_range("1970-01-01", freq="D", periods=10), dims=("x",), name="x" + ) + x = xr.core.missing.get_clean_interp_index(xcoord, "x") + if not use_datetime: + xcoord = x + + da_raw = DataArray( + np.stack( + (10 + 1e-15 * x + 2e-28 * x ** 2, 30 + 2e-14 * x + 1e-29 * x ** 2) + ), + dims=("d", "x"), + coords={"x": xcoord, "d": [0, 1]}, + ) + + if use_dask: + da = da_raw.chunk({"d": 1}) + else: + da = da_raw + + out = da.polyfit("x", 2) + expected = DataArray( + [[2e-28, 1e-15, 10], [1e-29, 2e-14, 30]], + dims=("d", "degree"), + coords={"degree": [2, 1, 0], "d": [0, 1]}, + ).T + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + + # With NaN + da_raw[0, 1] = np.nan + if use_dask: + da = da_raw.chunk({"d": 1}) + else: + da = da_raw + out = da.polyfit("x", 2, skipna=True, cov=True) + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + assert "polyfit_covariance" in out + + # Skipna + Full output + out = da.polyfit("x", 2, skipna=True, full=True) + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + assert out.x_matrix_rank == 3 + np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0]) + + def test_pad_constant(self): + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad(dim_0=(1, 3)) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5).astype(np.float32), + mode="constant", + pad_width=((1, 3), (0, 0), (0, 0)), + constant_values=np.nan, + ) + ) + assert actual.shape == (7, 4, 5) + assert_identical(actual, expected) + + def test_pad_coords(self): + ar = DataArray( + np.arange(3 * 4 * 5).reshape(3, 
4, 5), + [("x", np.arange(3)), ("y", np.arange(4)), ("z", np.arange(5))], + ) + actual = ar.pad(x=(1, 3), constant_values=1) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5), + mode="constant", + pad_width=((1, 3), (0, 0), (0, 0)), + constant_values=1, + ), + [ + ( + "x", + np.pad( + np.arange(3).astype(np.float32), + mode="constant", + pad_width=(1, 3), + constant_values=np.nan, + ), + ), + ("y", np.arange(4)), + ("z", np.arange(5)), + ], + ) + assert_identical(actual, expected) + + @pytest.mark.parametrize("mode", ("minimum", "maximum", "mean", "median")) + @pytest.mark.parametrize( + "stat_length", (None, 3, (1, 3), {"dim_0": (2, 1), "dim_2": (4, 2)}) + ) + def test_pad_stat_length(self, mode, stat_length): + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad(dim_0=(1, 3), dim_2=(2, 2), mode=mode, stat_length=stat_length) + if isinstance(stat_length, dict): + stat_length = (stat_length["dim_0"], (4, 4), stat_length["dim_2"]) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5), + pad_width=((1, 3), (0, 0), (2, 2)), + mode=mode, + stat_length=stat_length, + ) + ) + assert actual.shape == (7, 4, 9) + assert_identical(actual, expected) + + @pytest.mark.parametrize( + "end_values", (None, 3, (3, 5), {"dim_0": (2, 1), "dim_2": (4, 2)}) + ) + def test_pad_linear_ramp(self, end_values): + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad( + dim_0=(1, 3), dim_2=(2, 2), mode="linear_ramp", end_values=end_values + ) + if end_values is None: + end_values = 0 + elif isinstance(end_values, dict): + end_values = (end_values["dim_0"], (4, 4), end_values["dim_2"]) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5), + pad_width=((1, 3), (0, 0), (2, 2)), + mode="linear_ramp", + end_values=end_values, + ) + ) + assert actual.shape == (7, 4, 9) + assert_identical(actual, expected) + + @pytest.mark.parametrize("mode", ("reflect", "symmetric")) + @pytest.mark.parametrize("reflect_type", (None, "even", "odd")) + def test_pad_reflect(self, mode, reflect_type): + + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad( + dim_0=(1, 3), dim_2=(2, 2), mode=mode, reflect_type=reflect_type + ) + np_kwargs = { + "array": np.arange(3 * 4 * 5).reshape(3, 4, 5), + "pad_width": ((1, 3), (0, 0), (2, 2)), + "mode": mode, + } + # numpy does not support reflect_type=None + if reflect_type is not None: + np_kwargs["reflect_type"] = reflect_type + expected = DataArray(np.pad(**np_kwargs)) + + assert actual.shape == (7, 4, 9) + assert_identical(actual, expected) + @pytest.fixture(params=[1]) def da(request): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 5e254c37e44..02698253e5d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -744,6 +744,10 @@ def test_coords_modify(self): expected = data.merge({"c": 11}).set_coords("c") assert_identical(expected, actual) + # regression test for GH3746 + del actual.coords["x"] + assert "x" not in actual.indexes + def test_update_index(self): actual = Dataset(coords={"x": [1, 2, 3]}) actual["x"] = ["a", "b", "c"] @@ -1458,6 +1462,17 @@ def test_categorical_reindex(self): actual = ds.reindex(cat=["foo"])["cat"].values assert (actual == np.array(["foo"])).all() + def test_categorical_multiindex(self): + i1 = pd.Series([0, 0]) + cat = pd.CategoricalDtype(categories=["foo", "baz", "bar"]) + i2 = pd.Series(["baz", "bar"], dtype=cat) + + df = pd.DataFrame({"i1": i1, "i2": i2, "values": [1, 2]}).set_index( + 
["i1", "i2"] + ) + actual = df.to_xarray() + assert actual["values"].shape == (1, 2) + def test_sel_drop(self): data = Dataset({"foo": ("x", [1, 2, 3])}, {"x": [0, 1, 2]}) expected = Dataset({"foo": 1}) @@ -2868,6 +2883,17 @@ def test_stack(self): actual = ds.stack(z=["x", "y"]) assert_identical(expected, actual) + actual = ds.stack(z=[...]) + assert_identical(expected, actual) + + # non list dims with ellipsis + actual = ds.stack(z=(...,)) + assert_identical(expected, actual) + + # ellipsis with given dim + actual = ds.stack(z=[..., "y"]) + assert_identical(expected, actual) + exp_index = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["y", "x"]) expected = Dataset( {"a": ("z", [0, 1, 0, 1]), "b": ("z", [0, 2, 1, 3]), "z": exp_index} @@ -4349,6 +4375,12 @@ def test_where(self): assert actual.a.name == "a" assert actual.a.attrs == ds.a.attrs + # lambda + ds = Dataset({"a": ("x", range(5))}) + expected = Dataset({"a": ("x", [np.nan, np.nan, 2, 3, 4])}) + actual = ds.where(lambda x: x > 1) + assert_identical(expected, actual) + def test_where_other(self): ds = Dataset({"a": ("x", range(5))}, {"x": range(5)}) expected = Dataset({"a": ("x", [-1, -1, 2, 3, 4])}, {"x": range(5)}) @@ -4356,6 +4388,9 @@ def test_where_other(self): assert_equal(expected, actual) assert actual.a.dtype == int + actual = ds.where(lambda x: x > 1, -1) + assert_equal(expected, actual) + with raises_regex(ValueError, "cannot set"): ds.where(ds > 1, other=0, drop=True) @@ -4688,12 +4723,13 @@ def test_reduce_keepdims(self): ) assert_identical(expected, actual) + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]]) - def test_quantile(self, q): + def test_quantile(self, q, skipna): ds = create_test_data(seed=123) for dim in [None, "dim1", ["dim1"]]: - ds_quantile = ds.quantile(q, dim=dim) + ds_quantile = ds.quantile(q, dim=dim, skipna=skipna) if is_scalar(q): assert "quantile" not in ds_quantile.dims else: @@ -4701,12 +4737,27 @@ def test_quantile(self, q): for var, dar in ds.data_vars.items(): assert var in ds_quantile - assert_identical(ds_quantile[var], dar.quantile(q, dim=dim)) + assert_identical( + ds_quantile[var], dar.quantile(q, dim=dim, skipna=skipna) + ) dim = ["dim1", "dim2"] - ds_quantile = ds.quantile(q, dim=dim) + ds_quantile = ds.quantile(q, dim=dim, skipna=skipna) assert "dim3" in ds_quantile.dims assert all(d not in ds_quantile.dims for d in dim) + @pytest.mark.parametrize("skipna", [True, False]) + def test_quantile_skipna(self, skipna): + q = 0.1 + dim = "time" + ds = Dataset({"a": ([dim], np.arange(0, 11))}) + ds = ds.where(ds >= 1) + + result = ds.quantile(q=q, dim=dim, skipna=skipna) + + value = 1.9 if skipna else np.nan + expected = Dataset({"a": value}, coords={"quantile": q}) + assert_identical(result, expected) + @requires_bottleneck def test_rank(self): ds = create_test_data(seed=1234) @@ -5448,6 +5499,32 @@ def test_ipython_key_completion(self): ds.data_vars[item] # should not raise assert sorted(actual) == sorted(expected) + def test_polyfit_output(self): + ds = create_test_data(seed=1) + + out = ds.polyfit("dim2", 2, full=False) + assert "var1_polyfit_coefficients" in out + + out = ds.polyfit("dim1", 2, full=True) + assert "var1_polyfit_coefficients" in out + assert "dim1_matrix_rank" in out + + out = ds.polyfit("time", 2) + assert len(out.data_vars) == 0 + + def test_pad(self): + ds = create_test_data(seed=1) + padded = ds.pad(dim2=(1, 1), constant_values=42) + + assert padded["dim2"].shape == (11,) + assert padded["var1"].shape 
== (8, 11) + assert padded["var2"].shape == (8, 11) + assert padded["var3"].shape == (10, 8) + assert dict(padded.dims) == {"dim1": 8, "dim2": 11, "dim3": 10, "time": 20} + + np.testing.assert_equal(padded["var1"].isel(dim2=[0, -1]).data, 42) + np.testing.assert_equal(padded["dim2"][[0, -1]].data, np.nan) + # Py.test tests @@ -5664,6 +5741,62 @@ def test_coarsen_coords_cftime(): np.testing.assert_array_equal(actual.time, expected_times) +def test_coarsen_keep_attrs(): + _attrs = {"units": "test", "long_name": "testing"} + + var1 = np.linspace(10, 15, 100) + var2 = np.linspace(5, 10, 100) + coords = np.linspace(1, 10, 100) + + ds = Dataset( + data_vars={"var1": ("coord", var1), "var2": ("coord", var2)}, + coords={"coord": coords}, + attrs=_attrs, + ) + + # Test dropped attrs + dat = ds.coarsen(coord=5).mean() + assert dat.attrs == {} + + # Test kept attrs using dataset keyword + dat = ds.coarsen(coord=5, keep_attrs=True).mean() + assert dat.attrs == _attrs + + # Test kept attrs using global option + with set_options(keep_attrs=True): + dat = ds.coarsen(coord=5).mean() + assert dat.attrs == _attrs + + +def test_rolling_keep_attrs(): + _attrs = {"units": "test", "long_name": "testing"} + + var1 = np.linspace(10, 15, 100) + var2 = np.linspace(5, 10, 100) + coords = np.linspace(1, 10, 100) + + ds = Dataset( + data_vars={"var1": ("coord", var1), "var2": ("coord", var2)}, + coords={"coord": coords}, + attrs=_attrs, + ) + + # Test dropped attrs + dat = ds.rolling(dim={"coord": 5}, min_periods=None, center=False).mean() + assert dat.attrs == {} + + # Test kept attrs using dataset keyword + dat = ds.rolling( + dim={"coord": 5}, min_periods=None, center=False, keep_attrs=True + ).mean() + assert dat.attrs == _attrs + + # Test kept attrs using global option + with set_options(keep_attrs=True): + dat = ds.rolling(dim={"coord": 5}, min_periods=None, center=False).mean() + assert dat.attrs == _attrs + + def test_rolling_properties(ds): # catching invalid args with pytest.raises(ValueError, match="exactly one dim/window should"): @@ -5950,7 +6083,7 @@ def test_integrate(dask): actual = da.integrate("x") # coordinate that contains x should be dropped. 
expected_x = xr.DataArray( - np.trapz(da, da["x"], axis=0), + np.trapz(da.compute(), da["x"], axis=0), dims=["y"], coords={k: v for k, v in da.coords.items() if "x" not in v.dims}, ) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index f4f11473e48..e61881cfce3 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -16,6 +16,7 @@ first, gradient, last, + least_squares, mean, np_timedelta64_to_float, pd_timedelta_to_float, @@ -279,6 +280,7 @@ def assert_dask_array(da, dask): @arm_xfail +@pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("dask", [False, True] if has_dask else [False]) def test_datetime_mean(dask): # Note: only testing numpy, as dask is broken upstream @@ -760,3 +762,20 @@ def test_timedelta_to_numeric(td): out = timedelta_to_numeric(td, "ns") np.testing.assert_allclose(out, 86400 * 1e9) assert isinstance(out, float) + + +@pytest.mark.parametrize("use_dask", [True, False]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_least_squares(use_dask, skipna): + if use_dask and not has_dask: + pytest.skip("requires dask") + lhs = np.array([[1, 2], [1, 2], [3, 2]]) + rhs = DataArray(np.array([3, 5, 7]), dims=("y",)) + + if use_dask: + rhs = rhs.chunk({"y": 1}) + + coeffs, residuals = least_squares(lhs, rhs.data, skipna=skipna) + + np.testing.assert_allclose(coeffs, [1.5, 1.25]) + np.testing.assert_allclose(residuals, [2.0]) diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 61ecf46b79b..6881c0bc0ff 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -115,7 +115,7 @@ def test_format_items(self): def test_format_array_flat(self): actual = formatting.format_array_flat(np.arange(100), 2) - expected = "0 ... 99" + expected = "..." assert expected == actual actual = formatting.format_array_flat(np.arange(100), 9) @@ -134,11 +134,13 @@ def test_format_array_flat(self): expected = "0 1 2 ... 98 99" assert expected == actual + # NB: Probably not ideal; an alternative would be cutting after the + # first ellipsis actual = formatting.format_array_flat(np.arange(100.0), 11) - expected = "0.0 ... 99.0" + expected = "0.0 ... ..." assert expected == actual - actual = formatting.format_array_flat(np.arange(100.0), 1) + actual = formatting.format_array_flat(np.arange(100.0), 12) expected = "0.0 ... 99.0" assert expected == actual @@ -154,16 +156,25 @@ def test_format_array_flat(self): expected = "" assert expected == actual - actual = formatting.format_array_flat(np.arange(1), 0) + actual = formatting.format_array_flat(np.arange(1), 1) expected = "0" assert expected == actual - actual = formatting.format_array_flat(np.arange(2), 0) + actual = formatting.format_array_flat(np.arange(2), 3) expected = "0 1" assert expected == actual - actual = formatting.format_array_flat(np.arange(4), 0) - expected = "0 ... 3" + actual = formatting.format_array_flat(np.arange(4), 7) + expected = "0 1 2 3" + assert expected == actual + + actual = formatting.format_array_flat(np.arange(5), 7) + expected = "0 ... 4" + assert expected == actual + + long_str = [" ".join(["hello world" for _ in range(100)])] + actual = formatting.format_array_flat(np.asarray([long_str]), 21) + expected = "'hello world hello..." 
assert expected == actual def test_pretty_print(self): diff --git a/xarray/tests/test_formatting_html.py b/xarray/tests/test_formatting_html.py index fea24ff93f8..239f339208d 100644 --- a/xarray/tests/test_formatting_html.py +++ b/xarray/tests/test_formatting_html.py @@ -51,6 +51,11 @@ def test_short_data_repr_html(dataarray): assert data_repr.startswith("array") +def test_short_data_repr_html_non_str_keys(dataset): + ds = dataset.assign({2: lambda x: x["tmin"]}) + fh.dataset_repr(ds) + + def test_short_data_repr_html_dask(dask_dataarray): import dask @@ -130,5 +135,5 @@ def test_repr_of_dataset(dataset): assert ( formatted.count("class='xr-section-summary-in' type='checkbox' checked>") == 3 ) - assert "<U4" in formatted + assert "<U4" in formatted or ">U4" in formatted assert "<IA>" in formatted diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 77558e741be..866d5fb0899 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -107,6 +107,39 @@ def test_groupby_input_mutation(): assert_identical(array, array_copy) # should not modify inputs +@pytest.mark.parametrize( + "obj", + [ + xr.DataArray([1, 2, 3, 4, 5, 6], [("x", [1, 1, 1, 2, 2, 2])]), + xr.Dataset({"foo": ("x", [1, 2, 3, 4, 5, 6])}, {"x": [1, 1, 1, 2, 2, 2]}), + ], +) +def test_groupby_map_shrink_groups(obj): + expected = obj.isel(x=[0, 1, 3, 4]) + actual = obj.groupby("x").map(lambda f: f.isel(x=[0, 1])) + assert_identical(expected, actual) + + +@pytest.mark.parametrize( + "obj", + [ + xr.DataArray([1, 2, 3], [("x", [1, 2, 2])]), + xr.Dataset({"foo": ("x", [1, 2, 3])}, {"x": [1, 2, 2]}), + ], +) +def test_groupby_map_change_group_size(obj): + def func(group): + if group.sizes["x"] == 1: + result = group.isel(x=[0, 0]) + else: + result = group.isel(x=[0]) + return result + + expected = obj.isel(x=[0, 0, 1]) + actual = obj.groupby("x").map(func) + assert_identical(expected, actual) + + def test_da_groupby_map_func_args(): def func(arg1, arg2, arg3=0): return arg1 + arg2 + arg3 @@ -414,7 +447,8 @@ def test_groupby_drops_nans(): # reduction operation along a different dimension actual = grouped.mean("time") - expected = ds.mean("time").where(ds.id.notnull()) + with pytest.warns(RuntimeWarning): # mean of empty slice + expected = ds.mean("time").where(ds.id.notnull()) assert_identical(actual, expected) # NaN in non-dimensional coordinate diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index c2bec2166c8..0502348160e 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -244,6 +244,36 @@ def test_interpolate_nd(case): assert_allclose(actual.transpose("y", "z"), expected) +@requires_scipy +def test_interpolate_nd_nd(): + """Interpolate nd array with an nd indexer sharing coordinates.""" + # Create original array + a = [0, 2] + x = [0, 1, 2] + da = xr.DataArray( + np.arange(6).reshape(2, 3), dims=("a", "x"), coords={"a": a, "x": x} + ) + + # Create indexer into `a` with dimensions (y, x) + y = [10] + c = {"x": x, "y": y} + ia = xr.DataArray([[1, 2, 2]], dims=("y", "x"), coords=c) + out = da.interp(a=ia) + expected = xr.DataArray([[1.5, 4, 5]], dims=("y", "x"), coords=c) + xr.testing.assert_allclose(out.drop_vars("a"), expected) + + # If the *shared* indexing coordinates do not match, interp should fail. 
+    with pytest.raises(ValueError):
+        c = {"x": [1], "y": y}
+        ia = xr.DataArray([[1]], dims=("y", "x"), coords=c)
+        da.interp(a=ia)
+
+    with pytest.raises(ValueError):
+        c = {"x": [5, 6, 7], "y": y}
+        ia = xr.DataArray([[1]], dims=("y", "x"), coords=c)
+        da.interp(a=ia)
+
+
 @pytest.mark.parametrize("method", ["linear"])
 @pytest.mark.parametrize("case", [0, 1])
 def test_interpolate_scalar(method, case):
@@ -556,7 +586,6 @@ def test_datetime_single_string():
     assert_allclose(actual.drop_vars("time"), expected)
 
 
-@pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/3751")
 @requires_cftime
 @requires_scipy
 def test_cftime():
@@ -583,7 +612,6 @@ def test_cftime_type_error():
         da.interp(time=times_new)
 
 
-@pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/3751")
 @requires_cftime
 @requires_scipy
 def test_cftime_list_of_strings():
@@ -605,7 +633,6 @@ def test_cftime_list_of_strings():
     assert_allclose(actual, expected)
 
 
-@pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/3751")
 @requires_cftime
 @requires_scipy
 def test_cftime_single_string():
@@ -667,7 +694,6 @@ def test_datetime_interp_noerror():
     a.interp(x=xi, time=xi.time)  # should not raise an error
 
 
-@pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/3751")
 @requires_cftime
 def test_3641():
     times = xr.cftime_range("0001", periods=3, freq="500Y")
diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py
index 6c8f3f65657..9057575b38c 100644
--- a/xarray/tests/test_merge.py
+++ b/xarray/tests/test_merge.py
@@ -3,6 +3,7 @@
 
 import xarray as xr
 from xarray.core import dtypes, merge
+from xarray.core.merge import MergeError
 from xarray.testing import assert_identical
 
 from . import raises_regex
@@ -49,6 +50,65 @@ def test_merge_dataarray_unnamed(self):
         with raises_regex(ValueError, "without providing an explicit name"):
             xr.merge([data])
 
+    def test_merge_arrays_attrs_default(self):
+        var1_attrs = {"a": 1, "b": 2}
+        var2_attrs = {"a": 1, "c": 3}
+        expected_attrs = {}
+
+        data = create_test_data()
+        data.var1.attrs = var1_attrs
+        data.var2.attrs = var2_attrs
+        actual = xr.merge([data.var1, data.var2])
+        expected = data[["var1", "var2"]]
+        expected.attrs = expected_attrs
+        assert actual.identical(expected)
+
+    @pytest.mark.parametrize(
+        "combine_attrs, var1_attrs, var2_attrs, expected_attrs, " "expect_exception",
+        [
+            (
+                "no_conflicts",
+                {"a": 1, "b": 2},
+                {"a": 1, "c": 3},
+                {"a": 1, "b": 2, "c": 3},
+                False,
+            ),
+            ("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
+            ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
+            (
+                "no_conflicts",
+                {"a": 1, "b": 2},
+                {"a": 4, "c": 3},
+                {"a": 1, "b": 2, "c": 3},
+                True,
+            ),
+            ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
+            ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
+            ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
+            (
+                "override",
+                {"a": 1, "b": 2},
+                {"a": 4, "b": 5, "c": 3},
+                {"a": 1, "b": 2},
+                False,
+            ),
+        ],
+    )
+    def test_merge_arrays_attrs(
+        self, combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception
+    ):
+        data = create_test_data()
+        data.var1.attrs = var1_attrs
+        data.var2.attrs = var2_attrs
+        if expect_exception:
+            with raises_regex(MergeError, "combine_attrs"):
+                actual = xr.merge([data.var1, data.var2], combine_attrs=combine_attrs)
+        else:
+            actual = xr.merge([data.var1, data.var2], combine_attrs=combine_attrs)
+            expected = data[["var1", "var2"]]
+            expected.attrs = expected_attrs
+            assert actual.identical(expected)
+
     def test_merge_dicts_simple(self):
         actual = xr.merge([{"foo": 0}, {"bar": "one"}, {"baz": 3.5}])
         expected = xr.Dataset({"foo": 0, "bar": "one", "baz": 3.5})
diff --git a/xarray/tests/test_options.py b/xarray/tests/test_options.py
index f155acbf494..19f74476ced 100644
--- a/xarray/tests/test_options.py
+++ b/xarray/tests/test_options.py
@@ -68,12 +68,12 @@ def test_nested_options():
 
 
 def test_display_style():
-    original = "text"
+    original = "html"
     assert OPTIONS["display_style"] == original
     with pytest.raises(ValueError):
         xarray.set_options(display_style="invalid_str")
-    with xarray.set_options(display_style="html"):
-        assert OPTIONS["display_style"] == "html"
+    with xarray.set_options(display_style="text"):
+        assert OPTIONS["display_style"] == "text"
     assert OPTIONS["display_style"] == original
 
 
@@ -177,10 +177,11 @@ def test_merge_attr_retention(self):
 
     def test_display_style_text(self):
         ds = create_test_dataset_attrs()
-        text = ds._repr_html_()
-        assert text.startswith("<pre>")
-        assert "'nested'" in text
-        assert "<xarray.Dataset>" in text
+        with xarray.set_options(display_style="text"):
+            text = ds._repr_html_()
+            assert text.startswith("<pre>")
+            assert "'nested'" in text
+            assert "<xarray.Dataset>" in text
 
     def test_display_style_html(self):
         ds = create_test_dataset_attrs()
@@ -191,9 +192,10 @@ def test_display_style_html(self):
 
     def test_display_dataarray_style_text(self):
         da = create_test_dataarray_attrs()
-        text = da._repr_html_()
-        assert text.startswith("<pre>")
-        assert "<xarray.DataArray 'var1'" in text
+        with xarray.set_options(display_style="text"):
+            text = da._repr_html_()
+            assert text.startswith("<pre>")
+            assert "<xarray.DataArray 'var1'" in text
 
     def test_display_dataarray_style_html(self):
         da = create_test_dataarray_attrs()
diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py
index 9ffbcd9c85e..7f3f1620133 100644
--- a/xarray/tests/test_plot.py
+++ b/xarray/tests/test_plot.py
@@ -591,6 +591,10 @@ def setUp(self):
     def test_step(self):
         self.darray[0, 0].plot.step()
 
+    @pytest.mark.parametrize("ds", ["pre", "post", "mid"])
+    def test_step_with_drawstyle(self, ds):
+        self.darray[0, 0].plot.step(drawstyle=ds)
+
     def test_coord_with_interval_step(self):
         """Test step plot with intervals."""
         bins = [-1, 0, 1, 2]
@@ -1749,6 +1753,7 @@ def test_can_set_vmin_vmax(self):
             assert np.allclose(expected, clim)
 
     @pytest.mark.slow
+    @pytest.mark.filterwarnings("ignore")
     def test_can_set_norm(self):
         norm = mpl.colors.SymLogNorm(0.1)
         self.g.map_dataarray(xplt.imshow, "x", "y", norm=norm)
diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py
index 21a212c29b3..09ab1be9af9 100644
--- a/xarray/tests/test_sparse.py
+++ b/xarray/tests/test_sparse.py
@@ -175,7 +175,7 @@ def test_variable_property(prop):
             marks=xfail(reason="mixed sparse-dense operation"),
         ),
         param(
-            do("pad_with_fill_value", pad_widths={"x": (1, 1)}, fill_value=5),
+            do("pad", mode="constant", pad_widths={"x": (1, 1)}, fill_value=5),
             True,
             marks=xfail(reason="Missing implementation for np.pad"),
         ),
diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py
index 9f63ebb1d42..2826dc2479c 100644
--- a/xarray/tests/test_units.py
+++ b/xarray/tests/test_units.py
@@ -1,3 +1,4 @@
+import functools
 import operator
 from distutils.version import LooseVersion
 
@@ -8,8 +9,9 @@
 import xarray as xr
 from xarray.core import formatting
 from xarray.core.npcompat import IS_NEP18_ACTIVE
+from xarray.testing import assert_allclose, assert_identical
 
-from .test_variable import VariableSubclassobjects
+from .test_variable import _PAD_XR_NP_ARGS, VariableSubclassobjects
 
 pint = pytest.importorskip("pint")
 DimensionalityError = pint.errors.DimensionalityError
@@ -70,53 +72,17 @@ def array_strip_units(array):
         return array
 
 
-def array_attach_units(data, unit, convert_from=None):
-    try:
-        unit, convert_from = unit
-    except TypeError:
-        pass
-
+def array_attach_units(data, unit):
     if isinstance(data, Quantity):
-        if not convert_from:
-            raise ValueError(
-                "cannot attach unit {unit} to quantity ({data.units})".format(
-                    unit=unit, data=data
-                )
-            )
-        elif isinstance(convert_from, unit_registry.Unit):
-            data = data.magnitude
-        elif convert_from is True:  # intentionally accept exactly true
-            if data.check(unit):
-                convert_from = data.units
-                data = data.magnitude
-            else:
-                raise ValueError(
-                    "cannot convert quantity ({data.units}) to {unit}".format(
-                        unit=unit, data=data
-                    )
-                )
-        else:
-            raise ValueError(
-                "cannot convert from invalid unit {convert_from}".format(
-                    convert_from=convert_from
-                )
-            )
+        raise ValueError(f"cannot attach unit {unit} to quantity {data}")
 
-    # to make sure we also encounter the case of "equal if converted"
-    if convert_from is not None:
-        quantity = (data * convert_from).to(
-            unit
-            if isinstance(unit, unit_registry.Unit)
-            else unit_registry.dimensionless
-        )
-    else:
-        try:
-            quantity = data * unit
-        except np.core._exceptions.UFuncTypeError:
-            if unit != 1:
-                raise
+    try:
+        quantity = data * unit
+    except np.core._exceptions.UFuncTypeError:
+        if isinstance(unit, unit_registry.Unit):
+            raise
 
-            quantity = data
+        quantity = data
 
     return quantity
 
@@ -241,6 +207,11 @@ def attach_units(obj, units):
 
 
 def convert_units(obj, to):
+    # preprocess: ignore non-unit targets (e.g. 1) by replacing them with None
+    to = {
+        key: None if not isinstance(value, unit_registry.Unit) else value
+        for key, value in to.items()
+    }
     if isinstance(obj, xr.Dataset):
         data_vars = {
             name: convert_units(array.variable, {None: to.get(name)})
@@ -282,6 +253,7 @@ def convert_units(obj, to):
 
 
 def assert_units_equal(a, b):
+    __tracebackhide__ = True
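+    # hide this helper from pytest tracebacks so failures point at the calling test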
     assert extract_units(a) == extract_units(b)
 
 
@@ -414,9 +386,8 @@ def __repr__(self):
         return f"function_{self.name}"
 
 
-@pytest.mark.xfail(reason="test bug: apply_ufunc should not be called that way")
 def test_apply_ufunc_dataarray(dtype):
-    func = function(
+    func = functools.partial(
         xr.apply_ufunc, np.mean, input_core_dims=[["x"]], kwargs={"axis": -1}
     )
 
@@ -427,12 +398,12 @@ def test_apply_ufunc_dataarray(dtype):
     expected = attach_units(func(strip_units(data_array)), extract_units(data_array))
     actual = func(data_array)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
-@pytest.mark.xfail(reason="test bug: apply_ufunc should not be called that way")
 def test_apply_ufunc_dataset(dtype):
-    func = function(
+    func = functools.partial(
         xr.apply_ufunc, np.mean, input_core_dims=[["x"]], kwargs={"axis": -1}
     )
 
@@ -450,10 +421,10 @@ def test_apply_ufunc_dataset(dtype):
     expected = attach_units(func(strip_units(ds)), extract_units(ds))
     actual = func(ds)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
-@pytest.mark.xfail(reason="blocked by `reindex` / `where`")
 @pytest.mark.parametrize(
     "unit,error",
     (
@@ -475,36 +446,40 @@ def test_apply_ufunc_dataset(dtype):
         "coords",
     ),
 )
-@pytest.mark.parametrize("fill_value", (np.float64(10), np.float64(np.nan)))
+@pytest.mark.parametrize("fill_value", (10, np.nan))
 def test_align_dataarray(fill_value, variant, unit, error, dtype):
     original_unit = unit_registry.m
 
     variants = {
-        "data": (unit, 1, 1),
-        "dims": (original_unit, unit, 1),
-        "coords": (original_unit, 1, unit),
+        "data": (unit, original_unit, original_unit),
+        "dims": (original_unit, unit, original_unit),
+        "coords": (original_unit, original_unit, unit),
     }
     data_unit, dim_unit, coord_unit = variants.get(variant)
 
     array1 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * original_unit
     array2 = np.linspace(0, 8, 2 * 5).reshape(2, 5).astype(dtype) * data_unit
     x = np.arange(2) * original_unit
-    x_a1 = np.array([10, 5]) * original_unit
-    x_a2 = np.array([10, 5]) * coord_unit
 
     y1 = np.arange(5) * original_unit
     y2 = np.arange(2, 7) * dim_unit
+    y_a1 = np.array([3, 5, 7, 8, 9]) * original_unit
+    y_a2 = np.array([7, 8, 9, 11, 13]) * coord_unit
 
-    data_array1 = xr.DataArray(
-        data=array1, coords={"x": x, "x_a": ("x", x_a1), "y": y1}, dims=("x", "y")
-    )
-    data_array2 = xr.DataArray(
-        data=array2, coords={"x": x, "x_a": ("x", x_a2), "y": y2}, dims=("x", "y")
-    )
+    coords1 = {"x": x, "y": y1}
+    coords2 = {"x": x, "y": y2}
+    if variant == "coords":
+        coords1["y_a"] = ("y", y_a1)
+        coords2["y_a"] = ("y", y_a2)
+
+    data_array1 = xr.DataArray(data=array1, coords=coords1, dims=("x", "y"))
+    data_array2 = xr.DataArray(data=array2, coords=coords2, dims=("x", "y"))
 
     fill_value = fill_value * data_unit
     func = function(xr.align, join="outer", fill_value=fill_value)
-    if error is not None:
+    if error is not None and not (
+        np.isnan(fill_value) and not isinstance(fill_value, Quantity)
+    ):
         with pytest.raises(error):
             func(data_array1, data_array2)
 
@@ -524,15 +499,19 @@ def test_align_dataarray(fill_value, variant, unit, error, dtype):
         **stripped_kwargs,
     )
     expected_a = attach_units(expected_a, units_a)
-    expected_b = convert_units(attach_units(expected_b, units_a), units_b)
+    if isinstance(array2, Quantity):
+        expected_b = convert_units(attach_units(expected_b, units_a), units_b)
+    else:
+        expected_b = attach_units(expected_b, units_b)
 
     actual_a, actual_b = func(data_array1, data_array2)
 
-    assert_equal_with_units(expected_a, actual_a)
-    assert_equal_with_units(expected_b, actual_b)
+    assert_units_equal(expected_a, actual_a)
+    assert_allclose(expected_a, actual_a)
+    assert_units_equal(expected_b, actual_b)
+    assert_allclose(expected_b, actual_b)
 
 
-@pytest.mark.xfail(reason="blocked by `reindex` / `where`")
 @pytest.mark.parametrize(
     "unit,error",
     (
@@ -558,31 +537,37 @@ def test_align_dataarray(fill_value, variant, unit, error, dtype):
 def test_align_dataset(fill_value, unit, variant, error, dtype):
     original_unit = unit_registry.m
 
-    variants = {"data": (unit, 1, 1), "dims": (1, unit, 1), "coords": (1, 1, unit)}
+    variants = {
+        "data": (unit, original_unit, original_unit),
+        "dims": (original_unit, unit, original_unit),
+        "coords": (original_unit, original_unit, unit),
+    }
     data_unit, dim_unit, coord_unit = variants.get(variant)
 
     array1 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * original_unit
     array2 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * data_unit
 
     x = np.arange(2) * original_unit
-    x_a1 = np.array([10, 5]) * original_unit
-    x_a2 = np.array([10, 5]) * coord_unit
 
     y1 = np.arange(5) * original_unit
     y2 = np.arange(2, 7) * dim_unit
+    y_a1 = np.array([3, 5, 7, 8, 9]) * original_unit
+    y_a2 = np.array([7, 8, 9, 11, 13]) * coord_unit
 
-    ds1 = xr.Dataset(
-        data_vars={"a": (("x", "y"), array1)},
-        coords={"x": x, "x_a": ("x", x_a1), "y": y1},
-    )
-    ds2 = xr.Dataset(
-        data_vars={"a": (("x", "y"), array2)},
-        coords={"x": x, "x_a": ("x", x_a2), "y": y2},
-    )
+    coords1 = {"x": x, "y": y1}
+    coords2 = {"x": x, "y": y2}
+    if variant == "coords":
+        coords1["y_a"] = ("y", y_a1)
+        coords2["y_a"] = ("y", y_a2)
+
+    ds1 = xr.Dataset(data_vars={"a": (("x", "y"), array1)}, coords=coords1)
+    ds2 = xr.Dataset(data_vars={"a": (("x", "y"), array2)}, coords=coords2)
 
     fill_value = fill_value * data_unit
     func = function(xr.align, join="outer", fill_value=fill_value)
-    if error is not None:
+    if error is not None and not (
+        np.isnan(fill_value) and not isinstance(fill_value, Quantity)
+    ):
         with pytest.raises(error):
             func(ds1, ds2)
 
@@ -600,12 +585,17 @@ def test_align_dataset(fill_value, unit, variant, error, dtype):
         strip_units(ds1), strip_units(convert_units(ds2, units_a)), **stripped_kwargs
     )
     expected_a = attach_units(expected_a, units_a)
-    expected_b = convert_units(attach_units(expected_b, units_a), units_b)
+    if isinstance(array2, Quantity):
+        expected_b = convert_units(attach_units(expected_b, units_a), units_b)
+    else:
+        expected_b = attach_units(expected_b, units_b)
 
     actual_a, actual_b = func(ds1, ds2)
 
-    assert_equal_with_units(expected_a, actual_a)
-    assert_equal_with_units(expected_b, actual_b)
+    assert_units_equal(expected_a, actual_a)
+    assert_allclose(expected_a, actual_a)
+    assert_units_equal(expected_b, actual_b)
+    assert_allclose(expected_b, actual_b)
 
 
 def test_broadcast_dataarray(dtype):
@@ -615,28 +605,53 @@ def test_broadcast_dataarray(dtype):
     a = xr.DataArray(data=array1, dims="x")
     b = xr.DataArray(data=array2, dims="y")
 
-    expected_a, expected_b = tuple(
-        attach_units(elem, extract_units(a))
-        for elem in xr.broadcast(strip_units(a), strip_units(b))
-    )
+    units_a = extract_units(a)
+    units_b = extract_units(b)
+    expected_a, expected_b = xr.broadcast(strip_units(a), strip_units(b))
+    expected_a = attach_units(expected_a, units_a)
+    expected_b = convert_units(attach_units(expected_b, units_a), units_b)
+
     actual_a, actual_b = xr.broadcast(a, b)
 
-    assert_equal_with_units(expected_a, actual_a)
-    assert_equal_with_units(expected_b, actual_b)
+    assert_units_equal(expected_a, actual_a)
+    assert_identical(expected_a, actual_a)
+    assert_units_equal(expected_b, actual_b)
+    assert_identical(expected_b, actual_b)
 
 
 def test_broadcast_dataset(dtype):
     array1 = np.linspace(0, 10, 2) * unit_registry.Pa
     array2 = np.linspace(0, 10, 3) * unit_registry.Pa
 
-    ds = xr.Dataset(data_vars={"a": ("x", array1), "b": ("y", array2)})
+    x1 = np.arange(2)
+    y1 = np.arange(3)
+
+    x2 = np.arange(2, 4)
+    y2 = np.arange(3, 6)
 
-    (expected,) = tuple(
-        attach_units(elem, extract_units(ds)) for elem in xr.broadcast(strip_units(ds))
+    ds = xr.Dataset(
+        data_vars={"a": ("x", array1), "b": ("y", array2)}, coords={"x": x1, "y": y1}
+    )
+    other = xr.Dataset(
+        data_vars={
+            "a": ("x", array1.to(unit_registry.hPa)),
+            "b": ("y", array2.to(unit_registry.hPa)),
+        },
+        coords={"x": x2, "y": y2},
     )
-    (actual,) = xr.broadcast(ds)
 
-    assert_equal_with_units(expected, actual)
+    units_a = extract_units(ds)
+    units_b = extract_units(other)
+    expected_a, expected_b = xr.broadcast(strip_units(ds), strip_units(other))
+    expected_a = attach_units(expected_a, units_a)
+    expected_b = attach_units(expected_b, units_b)
+
+    actual_a, actual_b = xr.broadcast(ds, other)
+
+    assert_units_equal(expected_a, actual_a)
+    assert_identical(expected_a, actual_a)
+    assert_units_equal(expected_b, actual_b)
+    assert_identical(expected_b, actual_b)
 
 
 @pytest.mark.parametrize(
@@ -706,7 +721,8 @@ def test_combine_by_coords(variant, unit, error, dtype):
     )
     actual = xr.combine_by_coords([ds, other])
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.parametrize(
@@ -717,12 +733,7 @@ def test_combine_by_coords(variant, unit, error, dtype):
             unit_registry.dimensionless, DimensionalityError, id="dimensionless"
         ),
         pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"),
-        pytest.param(
-            unit_registry.mm,
-            None,
-            id="compatible_unit",
-            marks=pytest.mark.xfail(reason="wrong order of arguments to `where`"),
-        ),
+        pytest.param(unit_registry.mm, None, id="compatible_unit"),
         pytest.param(unit_registry.m, None, id="identical_unit"),
     ),
     ids=repr,
@@ -810,7 +821,8 @@ def test_combine_nested(variant, unit, error, dtype):
     )
     actual = func([[ds1, ds2], [ds3, ds4]])
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.parametrize(
@@ -862,7 +874,8 @@ def test_concat_dataarray(variant, unit, error, dtype):
     )
     actual = xr.concat([arr1, arr2], dim="x")
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.parametrize(
@@ -912,10 +925,10 @@ def test_concat_dataset(variant, unit, error, dtype):
     )
     actual = xr.concat([ds1, ds2], dim="x")
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
-@pytest.mark.xfail(reason="blocked by `reindex` / `where`")
 @pytest.mark.parametrize(
     "unit,error",
     (
@@ -948,64 +961,81 @@ def test_merge_dataarray(variant, unit, error, dtype):
     data_unit, dim_unit, coord_unit = variants.get(variant)
 
     array1 = np.linspace(0, 1, 2 * 3).reshape(2, 3).astype(dtype) * original_unit
+    x1 = np.arange(2) * original_unit
+    y1 = np.arange(3) * original_unit
+    u1 = np.linspace(10, 20, 2) * original_unit
+    v1 = np.linspace(10, 20, 3) * original_unit
+
     array2 = np.linspace(1, 2, 2 * 4).reshape(2, 4).astype(dtype) * data_unit
-    array3 = np.linspace(0, 2, 3 * 4).reshape(3, 4).astype(dtype) * data_unit
+    x2 = np.arange(2, 4) * dim_unit
+    z2 = np.arange(4) * original_unit
+    u2 = np.linspace(20, 30, 2) * coord_unit
+    w2 = np.linspace(10, 20, 4) * original_unit
 
-    x = np.arange(2) * original_unit
-    y = np.arange(3) * original_unit
-    z = np.arange(4) * original_unit
-    u = np.linspace(10, 20, 2) * original_unit
-    v = np.linspace(10, 20, 3) * original_unit
-    w = np.linspace(10, 20, 4) * original_unit
+    array3 = np.linspace(0, 2, 3 * 4).reshape(3, 4).astype(dtype) * data_unit
+    y3 = np.arange(3, 6) * dim_unit
+    z3 = np.arange(4, 8) * dim_unit
+    v3 = np.linspace(10, 20, 3) * coord_unit
+    w3 = np.linspace(10, 20, 4) * coord_unit
 
     arr1 = xr.DataArray(
         name="a",
         data=array1,
-        coords={"x": x, "y": y, "u": ("x", u), "v": ("y", v)},
+        coords={"x": x1, "y": y1, "u": ("x", u1), "v": ("y", v1)},
         dims=("x", "y"),
     )
     arr2 = xr.DataArray(
-        name="b",
+        name="a",
         data=array2,
-        coords={
-            "x": np.arange(2, 4) * dim_unit,
-            "z": z,
-            "u": ("x", np.linspace(20, 30, 2) * coord_unit),
-            "w": ("z", w),
-        },
+        coords={"x": x2, "z": z2, "u": ("x", u2), "w": ("z", w2)},
         dims=("x", "z"),
     )
     arr3 = xr.DataArray(
-        name="c",
+        name="a",
         data=array3,
-        coords={
-            "y": np.arange(3, 6) * dim_unit,
-            "z": np.arange(4, 8) * dim_unit,
-            "v": ("y", np.linspace(10, 20, 3) * coord_unit),
-            "w": ("z", np.linspace(10, 20, 4) * coord_unit),
-        },
+        coords={"y": y3, "z": z3, "v": ("y", v3), "w": ("z", w3)},
         dims=("y", "z"),
     )
 
-    func = function(xr.merge)
     if error is not None:
         with pytest.raises(error):
-            func([arr1, arr2, arr3])
+            xr.merge([arr1, arr2, arr3])
 
         return
 
-    units = {name: original_unit for name in list("abcuvwxyz")}
+    units = {name: original_unit for name in list("axyzuvw")}
+
     convert_and_strip = lambda arr: strip_units(convert_units(arr, units))
-    expected = attach_units(
-        func([strip_units(arr1), convert_and_strip(arr2), convert_and_strip(arr3)]),
-        units,
+    expected_units = {
+        "a": original_unit,
+        "u": original_unit,
+        "v": original_unit,
+        "w": original_unit,
+        "x": original_unit,
+        "y": original_unit,
+        "z": original_unit,
+    }
+
+    expected = convert_units(
+        attach_units(
+            xr.merge(
+                [
+                    convert_and_strip(arr1),
+                    convert_and_strip(arr2),
+                    convert_and_strip(arr3),
+                ]
+            ),
+            units,
+        ),
+        expected_units,
     )
-    actual = func([arr1, arr2, arr3])
 
-    assert_equal_with_units(expected, actual)
+    actual = xr.merge([arr1, arr2, arr3])
+
+    assert_units_equal(expected, actual)
+    assert_allclose(expected, actual)
 
 
-@pytest.mark.xfail(reason="blocked by `reindex` / `where`")
 @pytest.mark.parametrize(
     "unit,error",
     (
@@ -1046,7 +1076,7 @@ def test_merge_dataset(variant, unit, error, dtype):
 
     ds1 = xr.Dataset(
         data_vars={"a": (("y", "x"), array1), "b": (("y", "x"), array2)},
-        coords={"x": x, "y": y, "z": ("x", z)},
+        coords={"x": x, "y": y, "u": ("x", z)},
     )
     ds2 = xr.Dataset(
         data_vars={
@@ -1056,18 +1086,18 @@ def test_merge_dataset(variant, unit, error, dtype):
         coords={
             "x": np.arange(3) * dim_unit,
             "y": np.arange(2, 4) * dim_unit,
-            "z": ("x", np.arange(-3, 0) * coord_unit),
+            "u": ("x", np.arange(-3, 0) * coord_unit),
         },
     )
     ds3 = xr.Dataset(
         data_vars={
-            "a": (("y", "x"), np.zeros_like(array1) * np.nan * data_unit),
-            "b": (("y", "x"), np.zeros_like(array2) * np.nan * data_unit),
+            "a": (("y", "x"), np.full_like(array1, np.nan) * data_unit),
+            "b": (("y", "x"), np.full_like(array2, np.nan) * data_unit),
         },
         coords={
             "x": np.arange(3, 6) * dim_unit,
             "y": np.arange(4, 6) * dim_unit,
-            "z": ("x", np.arange(3, 6) * coord_unit),
+            "u": ("x", np.arange(3, 6) * coord_unit),
         },
     )
 
@@ -1080,12 +1110,20 @@ def test_merge_dataset(variant, unit, error, dtype):
 
     units = extract_units(ds1)
     convert_and_strip = lambda ds: strip_units(convert_units(ds, units))
-    expected = attach_units(
-        func([strip_units(ds1), convert_and_strip(ds2), convert_and_strip(ds3)]), units
+    expected_units = {name: original_unit for name in list("abxyzu")}
+    expected = convert_units(
+        attach_units(
+            func(
+                [convert_and_strip(ds1), convert_and_strip(ds2), convert_and_strip(ds3)]
+            ),
+            units,
+        ),
+        expected_units,
     )
     actual = func([ds1, ds2, ds3])
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_allclose(expected, actual)
 
 
 @pytest.mark.parametrize("func", (xr.zeros_like, xr.ones_like))
@@ -1094,10 +1132,12 @@ def test_replication_dataarray(func, dtype):
     data_array = xr.DataArray(data=array, dims="x")
 
     numpy_func = getattr(np, func.__name__)
-    expected = xr.DataArray(data=numpy_func(array), dims="x")
+    units = extract_units(numpy_func(data_array))
+    expected = attach_units(func(data_array), units)
     actual = func(data_array)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.parametrize("func", (xr.zeros_like, xr.ones_like))
@@ -1114,12 +1154,13 @@ def test_replication_dataset(func, dtype):
     )
 
     numpy_func = getattr(np, func.__name__)
-    expected = ds.copy(
-        data={name: numpy_func(array.data) for name, array in ds.data_vars.items()}
-    )
+    units = extract_units(ds.map(numpy_func))
+    expected = attach_units(func(strip_units(ds)), units)
+
     actual = func(ds)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.xfail(
@@ -1158,7 +1199,8 @@ def test_replication_full_like_dataarray(unit, error, dtype):
     )
     actual = xr.full_like(data_array, fill_value=fill_value)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.xfail(
@@ -1208,7 +1250,8 @@ def test_replication_full_like_dataset(unit, error, dtype):
     )
     actual = xr.full_like(ds, fill_value=fill_value)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.parametrize(
@@ -1250,7 +1293,8 @@ def test_where_dataarray(fill_value, unit, error, dtype):
     )
     actual = xr.where(cond, x, fill_value)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 @pytest.mark.parametrize(
@@ -1294,7 +1338,8 @@ def test_where_dataset(fill_value, unit, error, dtype):
     )
     actual = xr.where(cond, ds, fill_value)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 def test_dot_dataarray(dtype):
@@ -1315,7 +1360,8 @@ def test_dot_dataarray(dtype):
     )
     actual = xr.dot(data_array, other)
 
-    assert_equal_with_units(expected, actual)
+    assert_units_equal(expected, actual)
+    assert_identical(expected, actual)
 
 
 def delete_attrs(*to_delete):
@@ -2032,42 +2078,32 @@ def test_no_conflicts(self, unit, dtype):
 
         assert expected == actual
 
-    def test_pad(self, dtype):
+    @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS)
+    def test_pad_constant_values(self, dtype, xr_arg, np_arg):
         data = np.arange(4 * 3 * 2).reshape(4, 3, 2).astype(dtype) * unit_registry.m
         v = xr.Variable(["x", "y", "z"], data)
 
-        xr_args = [{"x": (2, 1)}, {"y": (0, 3)}, {"x": (3, 1), "z": (2, 0)}]
-        np_args = [
-            ((2, 1), (0, 0), (0, 0)),
-            ((0, 0), (0, 3), (0, 0)),
-            ((3, 1), (0, 0), (2, 0)),
-        ]
-        for xr_arg, np_arg in zip(xr_args, np_args):
-            actual = v.pad_with_fill_value(**xr_arg)
-            expected = xr.Variable(
-                v.dims,
-                np.pad(
-                    v.data.astype(float),
-                    np_arg,
-                    mode="constant",
-                    constant_values=np.nan,
-                ),
-            )
-            xr.testing.assert_identical(expected, actual)
-            assert_units_equal(expected, actual)
-            assert isinstance(actual._data, type(v._data))
+        actual = v.pad(**xr_arg, mode="constant")
+        expected = xr.Variable(
+            v.dims,
+            np.pad(
+                v.data.astype(float), np_arg, mode="constant", constant_values=np.nan,
+            ),
+        )
+        xr.testing.assert_identical(expected, actual)
+        assert_units_equal(expected, actual)
+        assert isinstance(actual._data, type(v._data))
 
         # for the boolean array, we pad False
         data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2)
         v = xr.Variable(["x", "y", "z"], data)
-        for xr_arg, np_arg in zip(xr_args, np_args):
-            actual = v.pad_with_fill_value(fill_value=data.flat[0], **xr_arg)
-            expected = xr.Variable(
-                v.dims,
-                np.pad(v.data, np_arg, mode="constant", constant_values=v.data.flat[0]),
-            )
-            xr.testing.assert_identical(actual, expected)
-            assert_units_equal(expected, actual)
+        actual = v.pad(**xr_arg, mode="constant", constant_values=data.flat[0])
+        expected = xr.Variable(
+            v.dims,
+            np.pad(v.data, np_arg, mode="constant", constant_values=v.data.flat[0]),
+        )
+        xr.testing.assert_identical(actual, expected)
+        assert_units_equal(expected, actual)
 
     @pytest.mark.parametrize(
         "unit,error",
@@ -2089,16 +2125,16 @@ def test_pad(self, dtype):
             pytest.param(unit_registry.m, None, id="identical_unit"),
         ),
     )
-    def test_pad_with_fill_value(self, unit, error, dtype):
+    def test_pad_unit_constant_value(self, unit, error, dtype):
         array = np.linspace(0, 5, 3 * 10).reshape(3, 10).astype(dtype) * unit_registry.m
         variable = xr.Variable(("x", "y"), array)
 
         fill_value = -100 * unit
 
-        func = method("pad_with_fill_value", x=(2, 3), y=(1, 4))
+        func = method("pad", mode="constant", x=(2, 3), y=(1, 4))
         if error is not None:
             with pytest.raises(error):
-                func(variable, fill_value=fill_value)
+                func(variable, constant_values=fill_value)
 
             return
 
@@ -2106,11 +2142,11 @@ def test_pad_with_fill_value(self, unit, error, dtype):
         expected = attach_units(
             func(
                 strip_units(variable),
-                fill_value=strip_units(convert_units(fill_value, units)),
+                constant_values=strip_units(convert_units(fill_value, units)),
             ),
             units,
         )
-        actual = func(variable, fill_value=fill_value)
+        actual = func(variable, constant_values=fill_value)
 
         assert_units_equal(expected, actual)
         xr.testing.assert_identical(expected, actual)
diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py
index af87b94393d..5f8b1770bd3 100644
--- a/xarray/tests/test_utils.py
+++ b/xarray/tests/test_utils.py
@@ -9,7 +9,7 @@
 from xarray.core import duck_array_ops, utils
 from xarray.core.utils import either_dict_or_kwargs
 
-from . import assert_array_equal, requires_cftime, requires_dask
+from . import assert_array_equal, raises_regex, requires_cftime, requires_dask
 from .test_coding_times import _all_cftime_date_types
 
 
@@ -120,9 +120,18 @@ def test_unsafe(self):
         with pytest.raises(ValueError):
             utils.update_safety_check(self.x, self.z)
 
-    def test_ordered_dict_intersection(self):
-        assert {"b": "B"} == utils.ordered_dict_intersection(self.x, self.y)
-        assert {} == utils.ordered_dict_intersection(self.x, self.z)
+    def test_compat_dict_intersection(self):
+        assert {"b": "B"} == utils.compat_dict_intersection(self.x, self.y)
+        assert {} == utils.compat_dict_intersection(self.x, self.z)
+
+    def test_compat_dict_union(self):
+        assert {"a": "A", "b": "B", "c": "C"} == utils.compat_dict_union(self.x, self.y)
+        with raises_regex(
+            ValueError,
+            "unsafe to merge dictionaries without "
+            "overriding values; conflicting key",
+        ):
+            utils.compat_dict_union(self.x, self.z)
 
     def test_dict_equiv(self):
         x = {}
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index 62fde920b1e..116466e112d 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -9,7 +9,7 @@
 import pytz
 
 from xarray import Coordinate, Dataset, IndexVariable, Variable, set_options
-from xarray.core import dtypes, indexing
+from xarray.core import dtypes, duck_array_ops, indexing
 from xarray.core.common import full_like, ones_like, zeros_like
 from xarray.core.indexing import (
     BasicIndexer,
@@ -38,6 +38,14 @@
     source_ndarray,
 )
 
+_PAD_XR_NP_ARGS = [
+    [{"x": (2, 1)}, ((2, 1), (0, 0), (0, 0))],
+    [{"x": 1}, ((1, 1), (0, 0), (0, 0))],
+    [{"y": (0, 3)}, ((0, 0), (0, 3), (0, 0))],
+    [{"x": (3, 1), "z": (2, 0)}, ((3, 1), (0, 0), (2, 0))],
+    [{"x": (3, 1), "z": 2}, ((3, 1), (0, 0), (2, 2))],
+]
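+# each entry above pairs xarray-style pad keyword arguments with the
+# equivalent numpy ``pad_width`` tuple for a variable with dims ("x", "y", "z")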
+
 
 class VariableSubclassobjects:
     def test_properties(self):
@@ -530,8 +538,7 @@ def test_copy_index_with_data(self):
         orig = IndexVariable("x", np.arange(5))
         new_data = np.arange(5, 10)
         actual = orig.copy(data=new_data)
-        expected = orig.copy()
-        expected.data = new_data
+        expected = IndexVariable("x", np.arange(5, 10))
         assert_identical(expected, actual)
 
     def test_copy_index_with_data_errors(self):
@@ -539,6 +546,10 @@ def test_copy_index_with_data_errors(self):
         new_data = np.arange(5, 20)
         with raises_regex(ValueError, "must match shape of object"):
             orig.copy(data=new_data)
+        with raises_regex(ValueError, "Cannot assign to the .data"):
+            orig.data = new_data
+        with raises_regex(ValueError, "Cannot assign to the .values"):
+            orig.values = new_data
 
     def test_replace(self):
         var = Variable(("x", "y"), [[1.5, 2.0], [3.1, 4.3]], {"foo": "bar"})
@@ -785,36 +796,65 @@ def test_getitem_error(self):
         with raises_regex(IndexError, "Dimensions of indexers mis"):
             v[:, ind]
 
-    def test_pad(self):
+    @pytest.mark.parametrize(
+        "mode",
+        [
+            "mean",
+            pytest.param(
+                "median",
+                marks=pytest.mark.xfail(reason="median is not implemented by Dask"),
+            ),
+            pytest.param(
+                "reflect", marks=pytest.mark.xfail(reason="dask.array.pad bug")
+            ),
+            "edge",
+            pytest.param(
+                "linear_ramp",
+                marks=pytest.mark.xfail(
+                    reason="pint bug: https://github.com/hgrecco/pint/issues/1026"
+                ),
+            ),
+            "maximum",
+            "minimum",
+            "symmetric",
+            "wrap",
+        ],
+    )
+    @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS)
+    def test_pad(self, mode, xr_arg, np_arg):
         data = np.arange(4 * 3 * 2).reshape(4, 3, 2)
         v = self.cls(["x", "y", "z"], data)
 
-        xr_args = [{"x": (2, 1)}, {"y": (0, 3)}, {"x": (3, 1), "z": (2, 0)}]
-        np_args = [
-            ((2, 1), (0, 0), (0, 0)),
-            ((0, 0), (0, 3), (0, 0)),
-            ((3, 1), (0, 0), (2, 0)),
-        ]
-        for xr_arg, np_arg in zip(xr_args, np_args):
-            actual = v.pad_with_fill_value(**xr_arg)
-            expected = np.pad(
-                np.array(v.data.astype(float)),
-                np_arg,
-                mode="constant",
-                constant_values=np.nan,
-            )
-            assert_array_equal(actual, expected)
-            assert isinstance(actual._data, type(v._data))
+        actual = v.pad(mode=mode, **xr_arg)
+        expected = np.pad(data, np_arg, mode=mode)
+
+        assert_array_equal(actual, expected)
+        assert isinstance(actual._data, type(v._data))
+
+    @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS)
+    def test_pad_constant_values(self, xr_arg, np_arg):
+        data = np.arange(4 * 3 * 2).reshape(4, 3, 2)
+        v = self.cls(["x", "y", "z"], data)
+
+        actual = v.pad(**xr_arg)
+        expected = np.pad(
+            np.array(v.data.astype(float)),
+            np_arg,
+            mode="constant",
+            constant_values=np.nan,
+        )
+        assert_array_equal(actual, expected)
+        assert isinstance(actual._data, type(v._data))
 
         # for the boolean array, we pad False
         data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2)
         v = self.cls(["x", "y", "z"], data)
-        for xr_arg, np_arg in zip(xr_args, np_args):
-            actual = v.pad_with_fill_value(fill_value=False, **xr_arg)
-            expected = np.pad(
-                np.array(v.data), np_arg, mode="constant", constant_values=False
-            )
-            assert_array_equal(actual, expected)
+
+        actual = v.pad(mode="constant", constant_values=False, **xr_arg)
+        expected = np.pad(
+            np.array(v.data), np_arg, mode="constant", constant_values=False
+        )
+        assert_array_equal(actual, expected)
 
     def test_rolling_window(self):
         # Just a working test. See test_nputils for the algorithm validation
@@ -1511,14 +1551,16 @@ def test_reduce(self):
         with pytest.warns(DeprecationWarning, match="allow_lazy is deprecated"):
             v.mean(dim="x", allow_lazy=False)
 
+    @pytest.mark.parametrize("skipna", [True, False])
     @pytest.mark.parametrize("q", [0.25, [0.50], [0.25, 0.75]])
     @pytest.mark.parametrize(
         "axis, dim", zip([None, 0, [0], [0, 1]], [None, "x", ["x"], ["x", "y"]])
     )
-    def test_quantile(self, q, axis, dim):
+    def test_quantile(self, q, axis, dim, skipna):
         v = Variable(["x", "y"], self.d)
-        actual = v.quantile(q, dim=dim)
-        expected = np.nanpercentile(self.d, np.array(q) * 100, axis=axis)
+        actual = v.quantile(q, dim=dim, skipna=skipna)
+        _percentile_func = np.nanpercentile if skipna else np.percentile
+        expected = _percentile_func(self.d, np.array(q) * 100, axis=axis)
         np.testing.assert_allclose(actual.values, expected)
 
     @requires_dask
@@ -1879,6 +1921,26 @@ def test_coarsen_2d(self):
         expected = self.cls(("x", "y"), [[10, 18], [42, 35]])
         assert_equal(actual, expected)
 
+    # perhaps @pytest.mark.parametrize("operation", [f for f in duck_array_ops])
+    def test_coarsen_keep_attrs(self, operation="mean"):
+        _attrs = {"units": "test", "long_name": "testing"}
+
+        test_func = getattr(duck_array_ops, operation, None)
+
+        # Test dropped attrs
+        with set_options(keep_attrs=False):
+            new = Variable(["coord"], np.linspace(1, 10, 100), attrs=_attrs).coarsen(
+                windows={"coord": 1}, func=test_func, boundary="exact", side="left"
+            )
+        assert new.attrs == {}
+
+        # Test kept attrs
+        with set_options(keep_attrs=True):
+            new = Variable(["coord"], np.linspace(1, 10, 100), attrs=_attrs).coarsen(
+                windows={"coord": 1}, func=test_func, boundary="exact", side="left"
+            )
+        assert new.attrs == _attrs
+
 
 @requires_dask
 class TestVariableWithDask(VariableSubclassobjects):
@@ -2034,8 +2096,28 @@ def test_getitem_uint(self):
         super().test_getitem_fancy()
 
     @pytest.mark.xfail
-    def test_pad(self):
-        super().test_rolling_window()
+    @pytest.mark.parametrize(
+        "mode",
+        [
+            "mean",
+            "median",
+            "reflect",
+            "edge",
+            "linear_ramp",
+            "maximum",
+            "minimum",
+            "symmetric",
+            "wrap",
+        ],
+    )
+    @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS)
+    def test_pad(self, mode, xr_arg, np_arg):
+        super().test_pad(mode, xr_arg, np_arg)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS)
+    def test_pad_constant_values(self, xr_arg, np_arg):
+        super().test_pad_constant_values(xr_arg, np_arg)
 
     @pytest.mark.xfail
     def test_rolling_window(self):
diff --git a/xarray/tests/test_weighted.py b/xarray/tests/test_weighted.py
new file mode 100644
index 00000000000..24531215dfb
--- /dev/null
+++ b/xarray/tests/test_weighted.py
@@ -0,0 +1,311 @@
+import numpy as np
+import pytest
+
+import xarray as xr
+from xarray import DataArray
+from xarray.tests import assert_allclose, assert_equal, raises_regex
+
+
+@pytest.mark.parametrize("as_dataset", (True, False))
+def test_weighted_non_DataArray_weights(as_dataset):
+
+    data = DataArray([1, 2])
+    if as_dataset:
+        data = data.to_dataset(name="data")
+
+    with raises_regex(ValueError, "`weights` must be a DataArray"):
+        data.weighted([1, 2])
+
+
+@pytest.mark.parametrize("as_dataset", (True, False))
+@pytest.mark.parametrize("weights", ([np.nan, 2], [np.nan, np.nan]))
+def test_weighted_weights_nan_raises(as_dataset, weights):
+
+    data = DataArray([1, 2])
+    if as_dataset:
+        data = data.to_dataset(name="data")
+
+    with pytest.raises(ValueError, match="`weights` cannot contain missing values."):
+        data.weighted(DataArray(weights))
+
+
+@pytest.mark.parametrize(
+    ("weights", "expected"),
+    (([1, 2], 3), ([2, 0], 2), ([0, 0], np.nan), ([-1, 1], np.nan)),
+)
+def test_weighted_sum_of_weights_no_nan(weights, expected):
+
+    da = DataArray([1, 2])
+    weights = DataArray(weights)
+    result = da.weighted(weights).sum_of_weights()
+
+    expected = DataArray(expected)
+
+    assert_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    ("weights", "expected"),
+    (([1, 2], 2), ([2, 0], np.nan), ([0, 0], np.nan), ([-1, 1], 1)),
+)
+def test_weighted_sum_of_weights_nan(weights, expected):
+
+    da = DataArray([np.nan, 2])
+    weights = DataArray(weights)
+    result = da.weighted(weights).sum_of_weights()
+
+    expected = DataArray(expected)
+
+    assert_equal(expected, result)
+
+
+@pytest.mark.parametrize("da", ([1.0, 2], [1, np.nan], [np.nan, np.nan]))
+@pytest.mark.parametrize("factor", [0, 1, 3.14])
+@pytest.mark.parametrize("skipna", (True, False))
+def test_weighted_sum_equal_weights(da, factor, skipna):
+    # if all weights are 'f', the weighted sum is f times the ordinary sum
+
+    da = DataArray(da)
+    weights = xr.full_like(da, factor)
+
+    expected = da.sum(skipna=skipna) * factor
+    result = da.weighted(weights).sum(skipna=skipna)
+
+    assert_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    ("weights", "expected"), (([1, 2], 5), ([0, 2], 4), ([0, 0], 0))
+)
+def test_weighted_sum_no_nan(weights, expected):
+
+    da = DataArray([1, 2])
+
+    weights = DataArray(weights)
+    result = da.weighted(weights).sum()
+    expected = DataArray(expected)
+
+    assert_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    ("weights", "expected"), (([1, 2], 4), ([0, 2], 4), ([1, 0], 0), ([0, 0], 0))
+)
+@pytest.mark.parametrize("skipna", (True, False))
+def test_weighted_sum_nan(weights, expected, skipna):
+
+    da = DataArray([np.nan, 2])
+
+    weights = DataArray(weights)
+    result = da.weighted(weights).sum(skipna=skipna)
+
+    if skipna:
+        expected = DataArray(expected)
+    else:
+        expected = DataArray(np.nan)
+
+    assert_equal(expected, result)
+
+
+@pytest.mark.filterwarnings("ignore:Mean of empty slice")
+@pytest.mark.parametrize("da", ([1.0, 2], [1, np.nan], [np.nan, np.nan]))
+@pytest.mark.parametrize("skipna", (True, False))
+@pytest.mark.parametrize("factor", [1, 2, 3.14])
+def test_weighted_mean_equal_weights(da, skipna, factor):
+    # if all weights are equal (!= 0), should yield the same result as mean
+
+    da = DataArray(da)
+
+    # constant weights; the factor cancels out in the weighted mean
+    weights = xr.full_like(da, factor)
+
+    expected = da.mean(skipna=skipna)
+    result = da.weighted(weights).mean(skipna=skipna)
+
+    assert_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    ("weights", "expected"), (([4, 6], 1.6), ([1, 0], 1.0), ([0, 0], np.nan))
+)
+def test_weighted_mean_no_nan(weights, expected):
+
+    da = DataArray([1, 2])
+    weights = DataArray(weights)
+    expected = DataArray(expected)
+
+    result = da.weighted(weights).mean()
+
+    assert_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    ("weights", "expected"), (([4, 6], 2.0), ([1, 0], np.nan), ([0, 0], np.nan))
+)
+@pytest.mark.parametrize("skipna", (True, False))
+def test_weighted_mean_nan(weights, expected, skipna):
+
+    da = DataArray([np.nan, 2])
+    weights = DataArray(weights)
+
+    if skipna:
+        expected = DataArray(expected)
+    else:
+        expected = DataArray(np.nan)
+
+    result = da.weighted(weights).mean(skipna=skipna)
+
+    assert_equal(expected, result)
+
+
+def expected_weighted(da, weights, dim, skipna, operation):
+    """
+    Generate expected result using ``*`` and ``sum``. This is checked against
+    the result of da.weighted which uses ``dot``.
+    """
+
+    weighted_sum = (da * weights).sum(dim=dim, skipna=skipna)
+
+    if operation == "sum":
+        return weighted_sum
+
+    masked_weights = weights.where(da.notnull())
+    sum_of_weights = masked_weights.sum(dim=dim, skipna=True)
+    valid_weights = sum_of_weights != 0
+    sum_of_weights = sum_of_weights.where(valid_weights)
+
+    if operation == "sum_of_weights":
+        return sum_of_weights
+
+    weighted_mean = weighted_sum / sum_of_weights
+
+    if operation == "mean":
+        return weighted_mean
+
+
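+# The helper above mirrors the formulas exercised below: the weighted sum is
+# sum(w_i * x_i) and the weighted mean divides that by sum(w_i), with weights
+# masked out of the denominator wherever the data is NaN.  For example, values
+# [1, 2] with weights [4, 6] give (4 * 1 + 6 * 2) / (4 + 6) = 1.6, as checked
+# in test_weighted_mean_no_nan.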
+@pytest.mark.parametrize("dim", ("a", "b", "c", ("a", "b"), ("a", "b", "c"), None))
+@pytest.mark.parametrize("operation", ("sum_of_weights", "sum", "mean"))
+@pytest.mark.parametrize("add_nans", (True, False))
+@pytest.mark.parametrize("skipna", (None, True, False))
+@pytest.mark.parametrize("as_dataset", (True, False))
+def test_weighted_operations_3D(dim, operation, add_nans, skipna, as_dataset):
+
+    dims = ("a", "b", "c")
+    coords = dict(a=[0, 1, 2, 3], b=[0, 1, 2, 3], c=[0, 1, 2, 3])
+
+    weights = DataArray(np.random.randn(4, 4, 4), dims=dims, coords=coords)
+
+    data = np.random.randn(4, 4, 4)
+
+    # add approximately 25 % NaNs (https://stackoverflow.com/a/32182680/3010700)
+    if add_nans:
+        c = int(data.size * 0.25)
+        data.ravel()[np.random.choice(data.size, c, replace=False)] = np.NaN
+
+    data = DataArray(data, dims=dims, coords=coords)
+
+    if as_dataset:
+        data = data.to_dataset(name="data")
+
+    if operation == "sum_of_weights":
+        result = data.weighted(weights).sum_of_weights(dim)
+    else:
+        result = getattr(data.weighted(weights), operation)(dim, skipna=skipna)
+
+    expected = expected_weighted(data, weights, dim, skipna, operation)
+
+    assert_allclose(expected, result)
+
+
+@pytest.mark.parametrize("operation", ("sum_of_weights", "sum", "mean"))
+@pytest.mark.parametrize("as_dataset", (True, False))
+def test_weighted_operations_nonequal_coords(operation, as_dataset):
+
+    weights = DataArray(np.random.randn(4), dims=("a",), coords=dict(a=[0, 1, 2, 3]))
+    data = DataArray(np.random.randn(4), dims=("a",), coords=dict(a=[1, 2, 3, 4]))
+
+    if as_dataset:
+        data = data.to_dataset(name="data")
+
+    expected = expected_weighted(
+        data, weights, dim="a", skipna=None, operation=operation
+    )
+    result = getattr(data.weighted(weights), operation)(dim="a")
+
+    assert_allclose(expected, result)
+
+
+@pytest.mark.parametrize("dim", ("dim_0", None))
+@pytest.mark.parametrize("shape_data", ((4,), (4, 4), (4, 4, 4)))
+@pytest.mark.parametrize("shape_weights", ((4,), (4, 4), (4, 4, 4)))
+@pytest.mark.parametrize("operation", ("sum_of_weights", "sum", "mean"))
+@pytest.mark.parametrize("add_nans", (True, False))
+@pytest.mark.parametrize("skipna", (None, True, False))
+@pytest.mark.parametrize("as_dataset", (True, False))
+def test_weighted_operations_different_shapes(
+    dim, shape_data, shape_weights, operation, add_nans, skipna, as_dataset
+):
+
+    weights = DataArray(np.random.randn(*shape_weights))
+
+    data = np.random.randn(*shape_data)
+
+    # add approximately 25 % NaNs
+    if add_nans:
+        c = int(data.size * 0.25)
+        data.ravel()[np.random.choice(data.size, c, replace=False)] = np.NaN
+
+    data = DataArray(data)
+
+    if as_dataset:
+        data = data.to_dataset(name="data")
+
+    if operation == "sum_of_weights":
+        result = getattr(data.weighted(weights), operation)(dim)
+    else:
+        result = getattr(data.weighted(weights), operation)(dim, skipna=skipna)
+
+    expected = expected_weighted(data, weights, dim, skipna, operation)
+
+    assert_allclose(expected, result)
+
+
+@pytest.mark.parametrize("operation", ("sum_of_weights", "sum", "mean"))
+@pytest.mark.parametrize("as_dataset", (True, False))
+@pytest.mark.parametrize("keep_attrs", (True, False, None))
+def test_weighted_operations_keep_attr(operation, as_dataset, keep_attrs):
+
+    weights = DataArray(np.random.randn(2, 2), attrs=dict(attr="weights"))
+    data = DataArray(np.random.randn(2, 2))
+
+    if as_dataset:
+        data = data.to_dataset(name="data")
+
+    data.attrs = dict(attr="weights")
+
+    result = getattr(data.weighted(weights), operation)(keep_attrs=True)
+
+    if operation == "sum_of_weights":
+        assert weights.attrs == result.attrs
+    else:
+        assert data.attrs == result.attrs
+
+    result = getattr(data.weighted(weights), operation)(keep_attrs=None)
+    assert not result.attrs
+
+    result = getattr(data.weighted(weights), operation)(keep_attrs=False)
+    assert not result.attrs
+
+
+@pytest.mark.xfail(reason="xr.Dataset.map does not copy attrs of DataArrays GH: 3595")
+@pytest.mark.parametrize("operation", ("sum", "mean"))
+def test_weighted_operations_keep_attr_da_in_ds(operation):
+    # GH #3595
+
+    weights = DataArray(np.random.randn(2, 2))
+    data = DataArray(np.random.randn(2, 2), attrs=dict(attr="data"))
+    data = data.to_dataset(name="a")
+
+    result = getattr(data.weighted(weights), operation)(keep_attrs=True)
+
+    assert data.a.attrs == result.a.attrs