forked from pydata/xarray

Commit 6e4c11f: Merge branch 'master' into fix/dask-computes
dcherian authored Oct 31, 2019
2 parents 08f7f74 + 53c5199
Showing 21 changed files with 400 additions and 78 deletions.
2 changes: 1 addition & 1 deletion ci/requirements/py36-min-all-deps.yml
@@ -13,7 +13,7 @@ dependencies:
   - cartopy=0.17
   - cdms2=3.1
   - cfgrib=0.9
-  - cftime=1.0.3 # FIXME need 1.0.5 (not released yet); 1.0.4 is broken
+  - cftime=1.0
   - coveralls
   - dask=1.2
   - distributed=1.27
2 changes: 1 addition & 1 deletion ci/requirements/py36.yml
@@ -9,7 +9,7 @@ dependencies:
   - cartopy
   - cdms2
   - cfgrib
-  - cftime<1.0.4 # FIXME need 1.0.5 (not released yet); 1.0.4 is broken
+  - cftime
   - coveralls
   - dask
   - distributed
2 changes: 1 addition & 1 deletion ci/requirements/py37-windows.yml
@@ -9,7 +9,7 @@ dependencies:
   - cartopy
   # - cdms2 # Not available on Windows
   # - cfgrib # Causes Python interpreter crash on Windows
-  - cftime<1.0.4 # FIXME need 1.0.5 (not released yet); 1.0.4 is broken
+  - cftime
   - coveralls
   - dask
   - distributed
2 changes: 1 addition & 1 deletion ci/requirements/py37.yml
@@ -9,7 +9,7 @@ dependencies:
   - cartopy
   - cdms2
   - cfgrib
-  - cftime<1.0.4 # FIXME need 1.0.5 (not released yet); 1.0.4 is broken
+  - cftime
   - coveralls
   - dask
   - distributed
4 changes: 2 additions & 2 deletions doc/data-structures.rst
@@ -411,7 +411,7 @@ Any variables using that dimension are dropped:
 As an alternate to dictionary-like modifications, you can use
 :py:meth:`~xarray.Dataset.assign` and :py:meth:`~xarray.Dataset.assign_coords`.
-These methods return a new dataset with additional (or replaced) or values:
+These methods return a new dataset with additional (or replaced) values:

 .. ipython:: python
@@ -420,7 +420,7 @@ These methods return a new dataset with additional (or replaced) or values:
 There is also the :py:meth:`~xarray.Dataset.pipe` method that allows you to use
 a method call with an external function (e.g., ``ds.pipe(func)``) instead of
 simply calling it (e.g., ``func(ds)``). This allows you to write pipelines for
-transforming you data (using "method chaining") instead of writing hard to
+transforming your data (using "method chaining") instead of writing hard to
 follow nested function calls:

 .. ipython:: python
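As an aside, a minimal sketch of the method-chaining style the corrected passage describes (the names here are illustrative, not from the documentation):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"t": ("x", np.arange(4.0))})

    def add_offset(ds, offset):
        return ds + offset

    # reads left to right, instead of the nested add_offset(ds, offset=2) * 3
    result = ds.pipe(add_offset, offset=2).pipe(lambda d: d * 3)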
2 changes: 1 addition & 1 deletion doc/examples/monthly-means.rst
@@ -83,7 +83,7 @@ the ``calendar.month_range`` function.
     for i, (month, year) in enumerate(zip(time.month, time.year)):
         month_length[i] = cal_days[month]
-        if leap_year(year, calendar=calendar):
+        if leap_year(year, calendar=calendar) and month == 2:
             month_length[i] += 1
     return month_length
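As an aside, a self-contained sketch of the corrected logic (the function name and the ``cal_days``/``leap_year`` definitions here are simplified stand-ins for the example's own, Gregorian rule only):

    import numpy as np
    import pandas as pd

    cal_days = {1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
                7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}

    def leap_year(year, calendar="standard"):
        # simplified: proleptic Gregorian only; the example handles more calendars
        return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

    def month_lengths(time, calendar="standard"):
        month_length = np.zeros(len(time), dtype=int)
        for i, (month, year) in enumerate(zip(time.month, time.year)):
            month_length[i] = cal_days[month]
            # the fix: only February gains a day in a leap year
            if leap_year(year, calendar=calendar) and month == 2:
                month_length[i] += 1
        return month_length

    month_lengths(pd.date_range("2000-01-01", periods=3, freq="M"))  # [31, 29, 31]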
4 changes: 2 additions & 2 deletions doc/terminology.rst
@@ -15,7 +15,7 @@ Terminology

 ----

-**Variable:** A `NetCDF-like variable <https://www.unidata.ucar.edu/software/netcdf/netcdf/Variables.html>`_ consisting of dimensions, data, and attributes which describe a single array. The main functional difference between variables and numpy arrays is that numerical operations on variables implement array broadcasting by dimension name. Each ``DataArray`` has an underlying variable that can be accessed via ``arr.variable``. However, a variable is not fully described outside of either a ``Dataset`` or a ``DataArray``.
+**Variable:** A `NetCDF-like variable <https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_set_components.html#variables>`_ consisting of dimensions, data, and attributes which describe a single array. The main functional difference between variables and numpy arrays is that numerical operations on variables implement array broadcasting by dimension name. Each ``DataArray`` has an underlying variable that can be accessed via ``arr.variable``. However, a variable is not fully described outside of either a ``Dataset`` or a ``DataArray``.

 .. note::

@@ -39,4 +39,4 @@ Terminology

 ----

-**Index:** An *index* is a data structure optimized for efficient selecting and slicing of an associated array. Xarray creates indexes for dimension coordinates so that operations along dimensions are fast, while non-dimension coordinates are not indexed. Under the hood, indexes are implemented as :py:class:`pandas.Index` objects. The index associated with dimension name ``x`` can be retrieved by ``arr.indexes[x]``. By construction, ``len(arr.dims) == len(arr.indexes)``
+**Index:** An *index* is a data structure optimized for efficient selecting and slicing of an associated array. Xarray creates indexes for dimension coordinates so that operations along dimensions are fast, while non-dimension coordinates are not indexed. Under the hood, indexes are implemented as :py:class:`pandas.Index` objects. The index associated with dimension name ``x`` can be retrieved by ``arr.indexes[x]``. By construction, ``len(arr.dims) == len(arr.indexes)``
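As an aside, a minimal sketch of retrieving such an index (illustrative values, not part of the diff):

    import numpy as np
    import xarray as xr

    arr = xr.DataArray(np.zeros(3), dims="x", coords={"x": [10, 20, 30]})
    arr.indexes["x"]  # the pandas.Index backing dimension "x"
    arr.sel(x=20)     # label-based selection uses that index
    assert len(arr.dims) == len(arr.indexes)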
56 changes: 40 additions & 16 deletions doc/whats-new.rst
@@ -21,12 +21,20 @@ v0.14.1 (unreleased)
 Breaking changes
 ~~~~~~~~~~~~~~~~

-- Minimum cftime version is now 1.0.3. By `Deepak Cherian <https://github.com/dcherian>`_.
+- Broken compatibility with cftime < 1.0.3.
+  By `Deepak Cherian <https://github.com/dcherian>`_.
+
+  .. note::
+
+    cftime version 1.0.4 is broken
+    (`cftime/126 <https://github.com/Unidata/cftime/issues/126>`_);
+    please use version 1.0.4.2 instead.
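As an aside, this advice translates to a pip requirement pin along these lines (illustrative, not part of the diff):

    cftime>=1.0.3,!=1.0.4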

 - All leftover support for dates from non-standard calendars through netcdftime, the
   module included in versions of netCDF4 prior to 1.4 that eventually became the
   cftime package, has been removed in favor of relying solely on the standalone
-  cftime package (:pull:`3450`). By `Spencer Clark
-  <https://github.com/spencerkclark>`_.
+  cftime package (:pull:`3450`).
+  By `Spencer Clark <https://github.com/spencerkclark>`_.

 New Features
 ~~~~~~~~~~~~
@@ -37,24 +45,24 @@ New Features
 - Changed `xr.ALL_DIMS` to equal python's `Ellipsis` (`...`), and changed internal usages to use
   `...` directly. As before, you can use this to instruct a `groupby` operation
   to reduce over all dimensions. While we have no plans to remove `xr.ALL_DIMS`, we suggest
-  using `...`.
+  using `...`. (:pull:`3418`)
   By `Maximilian Roos <https://github.com/max-sixty>`_
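As an aside, a minimal sketch of the `...` usage this entry describes (illustrative, not part of the diff):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(6).reshape(2, 3), dims=["x", "y"],
                      coords={"x": ["a", "b"]})
    # reduce each group over all remaining dimensions;
    # previously spelled da.groupby("x").sum(xr.ALL_DIMS)
    da.groupby("x").sum(...)  # -> values [3, 12] along x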
-- Added integration tests against `pint <https://pint.readthedocs.io/>`_.
-  (:pull:`3238`) by `Justus Magin <https://github.com/keewis>`_.
-
-  .. note::
-
-    At the moment of writing, these tests *as well as the ability to use pint in general*
-    require `a highly experimental version of pint
-    <https://github.com/andrewgsavage/pint/pull/6>`_ (install with
-    ``pip install git+https://github.com/andrewgsavage/pint.git@refs/pull/6/head``).
-    Even with it, interaction with non-numpy array libraries, e.g. dask or sparse, is broken.

 - :py:func:`~xarray.dot`, and :py:func:`~xarray.DataArray.dot` now support the
   `dims=...` option to sum over the union of dimensions of all input arrays
   (:issue:`3423`) by `Mathias Hauser <https://github.com/mathause>`_.
+- Added new :py:meth:`Dataset._repr_html_` and :py:meth:`DataArray._repr_html_` to improve
+  representation of objects in jupyter. By default this feature is turned off
+  for now. Enable it with :py:meth:`xarray.set_options(display_style="html")`.
+  (:pull:`3425`) by `Benoit Bovy <https://github.com/benbovy>`_ and
+  `Julia Signell <https://github.com/jsignell>`_.
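As an aside, opting in looks like this (a sketch, not part of the diff):

    import xarray as xr

    # the default in this release is display_style="text"
    xr.set_options(display_style="html")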
+- Implement `dask deterministic hashing
+  <https://docs.dask.org/en/latest/custom-collections.html#deterministic-hashing>`_
+  for xarray objects. Note that xarray objects with a dask.array backend already used
+  deterministic hashing in previous releases; this change implements it when whole
+  xarray objects are embedded in a dask graph, e.g. when :meth:`DataArray.map` is
+  invoked. (:issue:`3378`, :pull:`3446`)
+  By `Deepak Cherian <https://github.com/dcherian>`_ and
+  `Guido Imperiale <https://github.com/crusaderky>`_.
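As an aside, a minimal sketch of what deterministic hashing buys (illustrative; uses ``dask.base.tokenize`` and assumes objects with equal contents now produce equal tokens):

    import dask.base
    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(3), dims="x")
    # content-based, repeatable token for the whole xarray object
    assert dask.base.tokenize(da) == dask.base.tokenize(da.copy(deep=True))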

 Bug fixes
 ~~~~~~~~~
@@ -73,20 +81,36 @@ Bug fixes

 Documentation
 ~~~~~~~~~~~~~

+- Fix leap year condition in example (http://xarray.pydata.org/en/stable/examples/monthly-means.html) by `Mickaël Lalande <https://github.com/mickaellalande>`_.
 - Fix the documentation of :py:meth:`DataArray.resample` and
   :py:meth:`Dataset.resample` and explicitly state that a
   datetime-like dimension is required. (:pull:`3400`)
   By `Justus Magin <https://github.com/keewis>`_.
 - Update the terminology page to address multidimensional coordinates. (:pull:`3410`)
   By `Jon Thielen <https://github.com/jthielen>`_.
+- Fix the documentation of :py:meth:`Dataset.integrate` and
+  :py:meth:`DataArray.integrate` and add an example to
+  :py:meth:`Dataset.integrate`. (:pull:`3469`)
+  By `Justus Magin <https://github.com/keewis>`_.

 Internal Changes
 ~~~~~~~~~~~~~~~~

+- Added integration tests against `pint <https://pint.readthedocs.io/>`_.
+  (:pull:`3238`) by `Justus Magin <https://github.com/keewis>`_.
+
+  .. note::
+
+    At the moment of writing, these tests *as well as the ability to use pint in general*
+    require `a highly experimental version of pint
+    <https://github.com/andrewgsavage/pint/pull/6>`_ (install with
+    ``pip install git+https://github.com/andrewgsavage/pint.git@refs/pull/6/head``).
+    Even with it, interaction with non-numpy array libraries, e.g. dask or sparse, is broken.

 - Use Python 3.6 idioms throughout the codebase. (:pull:3419)
   By `Maximilian Roos <https://github.com/max-sixty>`_


 .. _whats-new.0.14.0:

 v0.14.0 (14 Oct 2019)
8 changes: 8 additions & 0 deletions properties/conftest.py
@@ -0,0 +1,8 @@
+try:
+    from hypothesis import settings
+except ImportError:
+    pass
+else:
+    # Run for a while - arrays are a bigger search space than usual
+    settings.register_profile("ci", deadline=None, print_blob=True)
+    settings.load_profile("ci")
7 changes: 1 addition & 6 deletions properties/test_encode_decode.py
@@ -10,15 +10,10 @@

 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
-from hypothesis import given, settings
+from hypothesis import given

 import xarray as xr

-# Run for a while - arrays are a bigger search space than usual
-settings.register_profile("ci", deadline=None)
-settings.load_profile("ci")
-
-
 an_array = npst.arrays(
     dtype=st.one_of(
         npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes()
97 changes: 97 additions & 0 deletions properties/test_pandas_roundtrip.py
@@ -0,0 +1,97 @@
"""
Property-based tests for roundtripping between xarray and pandas objects.
"""
import pytest

pytest.importorskip("hypothesis")

from functools import partial
import hypothesis.extra.numpy as npst
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import given

import numpy as np
import pandas as pd
import xarray as xr

numeric_dtypes = st.one_of(
npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes()
)

numeric_series = numeric_dtypes.flatmap(lambda dt: pdst.series(dtype=dt))

an_array = npst.arrays(
dtype=numeric_dtypes,
shape=npst.array_shapes(max_dims=2), # can only convert 1D/2D to pandas
)


@st.composite
def datasets_1d_vars(draw):
"""Generate datasets with only 1D variables
Suitable for converting to pandas dataframes.
"""
# Generate an index for the dataset
idx = draw(pdst.indexes(dtype="u8", min_size=0, max_size=100))

# Generate 1-3 variables, 1D with the same length as the index
vars_strategy = st.dictionaries(
keys=st.text(),
values=npst.arrays(dtype=numeric_dtypes, shape=len(idx)).map(
partial(xr.Variable, ("rows",))
),
min_size=1,
max_size=3,
)
return xr.Dataset(draw(vars_strategy), coords={"rows": idx})


@given(st.data(), an_array)
def test_roundtrip_dataarray(data, arr):
names = data.draw(
st.lists(st.text(), min_size=arr.ndim, max_size=arr.ndim, unique=True).map(
tuple
)
)
coords = {name: np.arange(n) for (name, n) in zip(names, arr.shape)}
original = xr.DataArray(arr, dims=names, coords=coords)
roundtripped = xr.DataArray(original.to_pandas())
xr.testing.assert_identical(original, roundtripped)


@given(datasets_1d_vars())
def test_roundtrip_dataset(dataset):
df = dataset.to_dataframe()
assert isinstance(df, pd.DataFrame)
roundtripped = xr.Dataset(df)
xr.testing.assert_identical(dataset, roundtripped)


@given(numeric_series, st.text())
def test_roundtrip_pandas_series(ser, ix_name):
# Need to name the index, otherwise Xarray calls it 'dim_0'.
ser.index.name = ix_name
arr = xr.DataArray(ser)
roundtripped = arr.to_pandas()
pd.testing.assert_series_equal(ser, roundtripped)
xr.testing.assert_identical(arr, roundtripped.to_xarray())


# Dataframes with columns of all the same dtype - for roundtrip to DataArray
numeric_homogeneous_dataframe = numeric_dtypes.flatmap(
lambda dt: pdst.data_frames(columns=pdst.columns(["a", "b", "c"], dtype=dt))
)


@pytest.mark.xfail
@given(numeric_homogeneous_dataframe)
def test_roundtrip_pandas_dataframe(df):
# Need to name the indexes, otherwise Xarray names them 'dim_0', 'dim_1'.
df.index.name = "rows"
df.columns.name = "cols"
arr = xr.DataArray(df)
roundtripped = arr.to_pandas()
pd.testing.assert_frame_equal(df, roundtripped)
xr.testing.assert_identical(arr, roundtripped.to_xarray())
22 changes: 16 additions & 6 deletions xarray/core/computation.py
@@ -884,7 +884,7 @@ def apply_ufunc(
     Plain scalars, numpy arrays and a mix of these with xarray objects is also
     supported:

-    >>> magnitude(4, 5)
+    >>> magnitude(3, 4)
     5.0
     >>> magnitude(3, np.array([0, 4]))
     array([3., 5.])
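For reference, ``magnitude`` is defined earlier in the same docstring, outside this hunk; a sketch consistent with the corrected doctest (assumed definition):

    import numpy as np
    import xarray as xr

    def magnitude(a, b):
        func = lambda x, y: np.sqrt(x ** 2 + y ** 2)
        return xr.apply_ufunc(func, a, b)

    magnitude(3, 4)  # 5.0, matching the corrected example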
@@ -1055,9 +1055,9 @@ def dot(*arrays, dims=None, **kwargs):
     ----------
     arrays: DataArray (or Variable) objects
         Arrays to compute.
-    dims: str or tuple of strings, optional
-        Which dimensions to sum over.
-        If not speciified, then all the common dimensions are summed over.
+    dims: '...', str or tuple of strings, optional
+        Which dimensions to sum over. Ellipsis ('...') sums over all dimensions.
+        If not specified, then all the common dimensions are summed over.
     **kwargs: dict
         Additional keyword arguments passed to numpy.einsum or
         dask.array.einsum
@@ -1070,7 +1070,7 @@ def dot(*arrays, dims=None, **kwargs):
     --------
     >>> import numpy as np
-    >>> import xarray as xp
+    >>> import xarray as xr
     >>> da_a = xr.DataArray(np.arange(3 * 2).reshape(3, 2), dims=['a', 'b'])
     >>> da_b = xr.DataArray(np.arange(3 * 2 * 2).reshape(3, 2, 2),
     ...                     dims=['a', 'b', 'c'])
@@ -1117,6 +1117,14 @@ def dot(*arrays, dims=None, **kwargs):
            [273, 446, 619]])
     Dimensions without coordinates: a, d

+    >>> xr.dot(da_a, da_b)
+    <xarray.DataArray (c: 2)>
+    array([110, 125])
+    Dimensions without coordinates: c
+
+    >>> xr.dot(da_a, da_b, dims=...)
+    <xarray.DataArray ()>
+    array(235)
     """
     from .dataarray import DataArray
     from .variable import Variable
@@ -1141,7 +1149,9 @@ def dot(*arrays, dims=None, **kwargs):
     einsum_axes = "abcdefghijklmnopqrstuvwxyz"
     dim_map = {d: einsum_axes[i] for i, d in enumerate(all_dims)}

-    if dims is None:
+    if dims is ...:
+        dims = all_dims
+    elif dims is None:
         # find dimensions that occur more than one times
         dim_counts = Counter()
         for arr in arrays:
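As an aside, a standalone sketch of the ``dims`` resolution rule added above (a simplified extract, not the full implementation):

    from collections import Counter

    def resolve_dims(dims, all_dims, per_array_dims):
        # mirrors xr.dot's handling of `dims` (simplified)
        if dims is ...:
            return list(all_dims)  # new: Ellipsis sums over every dimension
        elif dims is None:
            # default: sum over dimensions appearing in more than one input
            counts = Counter(d for ds in per_array_dims for d in ds)
            return [d for d, c in counts.items() if c > 1]
        elif isinstance(dims, str):
            return [dims]
        return list(dims)

    resolve_dims(..., ["a", "b", "c"], [("a", "b"), ("a", "b", "c")])   # ['a', 'b', 'c']
    resolve_dims(None, ["a", "b", "c"], [("a", "b"), ("a", "b", "c")])  # ['a', 'b']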
(Diff truncated: the remaining changed files in this commit are not shown.)