From 45fd0e63f43cf313b022a33aeec7f0f982e1908b Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 19 Nov 2019 14:06:45 +0000 Subject: [PATCH] Numpy 1.18 support (#3537) * Closes #3409 * Unpin versions * Rewrite unit test for clarity about its real scope * mean() on dask * Trivial * duck_array_ops should never receive xarray.Variable --- ci/azure/install.yml | 2 +- ci/requirements/py36.yml | 2 +- ci/requirements/py37.yml | 2 +- doc/whats-new.rst | 7 +++- xarray/core/dataset.py | 4 ++- xarray/core/duck_array_ops.py | 28 ++++++++++++++-- xarray/tests/test_dataset.py | 4 ++- xarray/tests/test_duck_array_ops.py | 50 +++++++++++++++++++---------- 8 files changed, 74 insertions(+), 25 deletions(-) diff --git a/ci/azure/install.yml b/ci/azure/install.yml index fee886ba804..baa69bcc8d5 100644 --- a/ci/azure/install.yml +++ b/ci/azure/install.yml @@ -16,9 +16,9 @@ steps: --pre \ --upgrade \ matplotlib \ + numpy \ pandas \ scipy - # numpy \ # FIXME https://github.com/pydata/xarray/issues/3409 pip install \ --no-deps \ --upgrade \ diff --git a/ci/requirements/py36.yml b/ci/requirements/py36.yml index 10fe69253e8..820160b19cc 100644 --- a/ci/requirements/py36.yml +++ b/ci/requirements/py36.yml @@ -25,7 +25,7 @@ dependencies: - nc-time-axis - netcdf4 - numba - - numpy<1.18 # FIXME https://github.com/pydata/xarray/issues/3409 + - numpy - pandas - pint - pip diff --git a/ci/requirements/py37.yml b/ci/requirements/py37.yml index 827c664a222..4a7aaf7d32b 100644 --- a/ci/requirements/py37.yml +++ b/ci/requirements/py37.yml @@ -25,7 +25,7 @@ dependencies: - nc-time-axis - netcdf4 - numba - - numpy<1.18 # FIXME https://github.com/pydata/xarray/issues/3409 + - numpy - pandas - pint - pip diff --git a/doc/whats-new.rst b/doc/whats-new.rst index cb274bcaee8..0c929b5b711 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -115,6 +115,12 @@ Bug fixes (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. +- Add support for numpy >=1.18 (); bugfix mean() on datetime64 arrays on dask backend + (:issue:`3409`, :pull:`3537`). By `Guido Imperiale `_. +- Add support for pandas >=0.26 (:issue:`3440`). + By `Deepak Cherian `_. +- Add support for pseudonetcdf >=3.1 (:pull:`3485`). + By `Barron Henderson `_. Documentation ~~~~~~~~~~~~~ @@ -133,7 +139,6 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ - - Added integration tests against `pint `_. (:pull:`3238`, :pull:`3447`, :pull:`3493`, :pull:`3508`) by `Justus Magin `_. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5de254614ff..c631a4c11ea 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5316,7 +5316,9 @@ def _integrate_one(self, coord, datetime_unit=None): datetime_unit, _ = np.datetime_data(coord_var.dtype) elif datetime_unit is None: datetime_unit = "s" # Default to seconds for cftime objects - coord_var = datetime_to_numeric(coord_var, datetime_unit=datetime_unit) + coord_var = coord_var._replace( + data=datetime_to_numeric(coord_var.data, datetime_unit=datetime_unit) + ) variables = {} coord_names = set() diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 71e79335c3d..cf616acb485 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -351,6 +351,26 @@ def f(values, axis=None, skipna=None, **kwargs): _mean = _create_nan_agg_method("mean") +def _datetime_nanmin(array): + """nanmin() function for datetime64. 
+
+    Caveats that this function deals with:
+
+    - In numpy < 1.18, min() on datetime64 incorrectly ignores NaT
+    - numpy nanmin() does not work on datetime64 (all versions at the moment of writing)
+    - dask min() does not work on datetime64 (all versions at the moment of writing)
+    """
+    assert array.dtype.kind in "mM"
+    dtype = array.dtype
+    # (NaT).astype(float) does not produce NaN...
+    array = where(pandas_isnull(array), np.nan, array.astype(float))
+    array = min(array, skipna=True)
+    if isinstance(array, float):
+        array = np.array(array)
+    # ...but (NaN).astype("M8") does produce NaT
+    return array.astype(dtype)
+
+
 def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
     """Convert an array containing datetime-like data to an array of floats.
 
@@ -370,7 +390,10 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
     """
     # TODO: make this function dask-compatible?
     if offset is None:
-        offset = array.min()
+        if array.dtype.kind in "Mm":
+            offset = _datetime_nanmin(array)
+        else:
+            offset = min(array)
     array = array - offset
     if not hasattr(array, "dtype"):  # scalar is converted to 0d-array
@@ -401,7 +424,8 @@ def mean(array, axis=None, skipna=None, **kwargs):
 
     array = asarray(array)
 
     if array.dtype.kind in "Mm":
-        offset = min(array)
+        offset = _datetime_nanmin(array)
+
+        # xarray always uses np.datetime64[ns] for np.datetime64 data
         dtype = "timedelta64[ns]"
 
     return (
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index be40ce7c6e8..de074da541f 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -5874,7 +5874,9 @@ def test_trapz_datetime(dask, which_datetime):
     actual = da.integrate("time", datetime_unit="D")
     expected_data = np.trapz(
-        da, duck_array_ops.datetime_to_numeric(da["time"], datetime_unit="D"), axis=0
+        da.data,
+        duck_array_ops.datetime_to_numeric(da["time"].data, datetime_unit="D"),
+        axis=0,
     )
     expected = xr.DataArray(
         expected_data,
diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py
index f678af2fec5..aee7bbd6b11 100644
--- a/xarray/tests/test_duck_array_ops.py
+++ b/xarray/tests/test_duck_array_ops.py
@@ -274,23 +274,39 @@ def assert_dask_array(da, dask):
 @arm_xfail
-@pytest.mark.parametrize("dask", [False, True])
-def test_datetime_reduce(dask):
-    time = np.array(pd.date_range("15/12/1999", periods=11))
-    time[8:11] = np.nan
-    da = DataArray(np.linspace(0, 365, num=11), dims="time", coords={"time": time})
-
-    if dask and has_dask:
-        chunks = {"time": 5}
-        da = da.chunk(chunks)
-
-    actual = da["time"].mean()
-    assert not pd.isnull(actual)
-    actual = da["time"].mean(skipna=False)
-    assert pd.isnull(actual)
-
-    # test for a 0d array
-    assert da["time"][0].mean() == da["time"][:1].mean()
+@pytest.mark.parametrize("dask", [False, True] if has_dask else [False])
+def test_datetime_mean(dask):
+    # Note: only testing numpy, as dask is broken upstream
+    da = DataArray(
+        np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype="M8"),
+        dims=["time"],
+    )
+    if dask:
+        # Trigger use case where a chunk is full of NaT
+        da = da.chunk({"time": 3})
+
+    expect = DataArray(np.array("2010-01-02", dtype="M8"))
+    expect_nat = DataArray(np.array("NaT", dtype="M8"))
+
+    actual = da.mean()
+    if dask:
+        assert actual.chunks is not None
+    assert_equal(actual, expect)
+
+    actual = da.mean(skipna=False)
+    if dask:
+        assert actual.chunks is not None
+    assert_equal(actual, expect_nat)
+
+    # tests for 1d array full of NaT
+    assert_equal(da[[1]].mean(), expect_nat)
assert_equal(da[[1]].mean(skipna=False), expect_nat) + + # tests for a 0d array + assert_equal(da[0].mean(), da[0]) + assert_equal(da[0].mean(skipna=False), da[0]) + assert_equal(da[1].mean(), expect_nat) + assert_equal(da[1].mean(skipna=False), expect_nat) @requires_cftime
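
As a quick illustration of the behaviour that test_datetime_mean pins down, here is a minimal sketch (not part of the patch; it assumes numpy >= 1.18 with this change applied, and the array values are invented for the example): mean() on a datetime64 array skips NaT by default and only propagates it when skipna=False.

    import numpy as np
    import xarray as xr

    # Two valid timestamps plus one NaT, mirroring the fixture in the test above.
    da = xr.DataArray(
        np.array(["2010-01-01", "NaT", "2010-01-03"], dtype="datetime64[ns]"),
        dims=["time"],
    )

    print(da.mean().values)              # 2010-01-02, NaT entries are skipped by default
    print(da.mean(skipna=False).values)  # NaT, because one element is NaT

Internally these calls pick their offset via the new _datetime_nanmin(), which is why the min()/nanmin() caveats listed in its docstring matter for mean().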
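
Likewise, a minimal sketch of the datetime_to_numeric() change exercised by test_trapz_datetime (illustration only; it calls the internal xarray.core.duck_array_ops helper the same way the test does, and the dates are invented): with offset=None the offset is now the NaT-aware minimum, so the result is the elapsed time since that minimum in the requested unit.

    import numpy as np
    from xarray.core import duck_array_ops

    times = np.array(["2010-01-01", "2010-01-03", "2010-01-07"], dtype="datetime64[ns]")

    # The offset defaults to the NaT-aware minimum (2010-01-01), so this prints
    # the number of days elapsed since it: [0. 2. 6.]
    print(duck_array_ops.datetime_to_numeric(times, datetime_unit="D"))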