From 80936fd4a561f7ad4ca6a15fdbd774bc4fc33825 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 30 Jan 2020 11:40:22 -0500 Subject: [PATCH 01/20] [WIP] Implementation of polyfit and polyval - minimum testing - no docs --- xarray/core/dask_array_ops.py | 12 +++++ xarray/core/dataarray.py | 97 ++++++++++++++++++++++++++++++++++ xarray/core/duck_array_ops.py | 9 ++++ xarray/core/nputils.py | 18 +++++++ xarray/tests/test_dataarray.py | 31 +++++++++++ 5 files changed, 167 insertions(+) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 37f261cc3ad..811483c1d70 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -95,3 +95,15 @@ def func(x, window, axis=-1): # crop boundary. index = (slice(None),) * axis + (slice(drop_size, drop_size + orig_shape[axis]),) return out[index] + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + import dask.array as da + lhs_da = da.from_array(lhs, chunks=(rhs.chunks[0], lhs.shape[1])) + if skipna: + results = da.apply_along_axis(nputils._nanpolyfit_1d, 0, rhs.data, lhs_da.data, dtype=float, shape=(lhs.shape[1] + 1,), rcond=rcond) + coeffs = results[:-1, ...] + residuals = results[-1, ...] + else: + coeffs, residuals, _, _ = da.linalg.lstsq(lhs_da.data, rhs.data) + return coeffs, residuals diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index a658f125054..02a4df59794 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -26,6 +26,7 @@ from . import ( computation, dtypes, + duck_array_ops, groupby, indexing, ops, @@ -54,6 +55,7 @@ from .indexes import Indexes, default_indexes, propagate_indexes from .indexing import is_fancy_indexer from .merge import PANDAS_TYPES, _extract_indexes_from_coords +from .missing import get_clean_interp_index from .options import OPTIONS from .utils import Default, ReprObject, _check_inplace, _default, either_dict_or_kwargs from .variable import ( @@ -3234,10 +3236,105 @@ def map_blocks( return map_blocks(func, self, args, kwargs) + def polyfit( + self, + dim : Hashable, + deg : int, + rcond : float = None, + w : Union[Hashable, Any] = None, + full : bool = False, + cov : bool = False, + skipna : bool = False + ): + x = get_clean_interp_index(self, dim) + order = int(deg) + 1 + lhs = np.vander(x, order, increasing=True) + + dims_to_stack = [dimname for dimname in self.dims if dimname != dim] + stacked_dim = utils.get_temp_dimname(dims_to_stack, 'stacked') + rhs = self.transpose(dim, *dims_to_stack).stack({stacked_dim: dims_to_stack}) + + if rcond is None: + rcond = x.shape[0] * np.core.finfo(x.dtype).eps + + # Weights: + if w is not None: + if isinstance(w, Hashable): + w = self[w] + w = np.asarray(w) + if w.ndim != 1: + raise TypeError("Expected a 1-d array for weights.") + if w.shape[0] != lhs.shape[0]: + raise TypeError("Expected w and {} to have the same length".format(dim)) + lhs *= w[:, np.newaxis] + rhs *= w[:, np.newaxis] + + # Scaling + scale = np.sqrt((lhs * lhs).sum(axis=0)) + lhs /= scale + + coeffs, residuals = duck_array_ops.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) + + degree_dim = utils.get_temp_dimname(dims_to_stack, 'degree') + coeffs = DataArray(coeffs / scale[:, np.newaxis], dims=(degree_dim, stacked_dim), + coords={degree_dim: np.arange(order), + stacked_dim: rhs[stacked_dim]}, + name=(self.name or '') + '_polyfit_coefficients' + ).unstack(stacked_dim) + + rank = np.linalg.matrix_rank(lhs) + if rank != order and not full: + warnings.warn("Polyfit may be poorly conditioned", np.RankWarning, 
stacklevel=4) + + if full or (cov is True): + residuals = DataArray(residuals, dims=(stacked_dim), coords={stacked_dim: rhs[stacked_dim]}, + name=(self.name or '') + '_polyfit_residuals' + ).unstack(stacked_dim) + if full: + sing = np.linalg.svd(lhs, compute_uv=False) + sing = DataArray(sing, dims=(degree_dim,), coords={degree_dim: np.arange(order)}, + name=(self.name or '') + '_polyfit_singular_values') + return coeffs, residuals, rank, sing, rcond + if cov: + Vbase = np.linalg.inv(np.dot(lhs.T, lhs)) + Vbase /= np.outer(scale, scale) + if cov == 'unscaled': + fac = 1 + else: + if x.shape[0] <= order: + raise ValueError("The number of data points must exceed order to scale the covariance matrix.") + fac = residuals / (x.shape[0] - order) + covariance = DataArray(Vbase, dims=('cov_i', 'cov_j'), name=(self.name or '') + '_polyfit_covariance') * fac + return coeffs, covariance + return coeffs + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = property(StringAccessor) +def polyval( + coord : Union["DataArray", Any], + coefficients : Union["DataArray", Any], + degree_dim : Union[str, int] = 'degree' +) -> "DataArray" : + if not isinstance(coord, DataArray): + coord = DataArray(coord, dims=('x',), name='x') + x = get_clean_interp_index(coord, coord.name) + + if isinstance(degree_dim, str): + deg_coord = coefficients[degree_dim] + else: # Axis number + if isinstance(coefficients, DataArray): + deg_coord = coefficients[coefficients.dims[degree_dim]] + else: + deg_coord = np.arange(coefficients.shape[degree_dim]) + + lhs = DataArray(np.vander(x, int(deg_coord.max()) + 1, increasing=True), + dims=(coord.name, degree_dim), + coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)}) + return (lhs * coefficients).sum(degree_dim) + + # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index c2fe604a9d3..df81d93f9a0 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -600,3 +600,12 @@ def rolling_window(array, axis, window, center, fill_value): return dask_array_ops.rolling_window(array, axis, window, center, fill_value) else: # np.ndarray return nputils.rolling_window(array, axis, window, center, fill_value) + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + """Return the coefficients and residuals of a least-squares fit. 
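+
+    The computation is dispatched to the dask implementation when ``rhs`` is a
+    dask array, and to the numpy implementation otherwise.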
+ """ + if isinstance(rhs, dask_array_type): + return dask_array_ops.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) + else: + return nputils.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index dba67174fc1..1513f9055dd 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -221,6 +221,24 @@ def f(values, axis=None, **kwargs): return f +def _nanpolyfit_1d(arr, x, rcond=None): + out = np.full((x.shape[1] + 1,), np.nan) + mask = np.isnan(arr) + if not np.all(mask): + out[:-1], out[-1], _, _ = np.linalg.lstsq(x[~mask, :], arr[~mask], rcond=rcond) + return out + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs.data, rcond=rcond) + if skipna: + nan_cols = np.nonzero(np.isnan(coeffs[0, :]))[0] + out = np.apply_along_axis(_nanpolyfit_1d, 0, rhs.data[:, nan_cols], lhs, rcond=rcond) + coeffs[:, nan_cols] = out[:-1, :] + residuals[nan_cols] = out[-1, :] + return coeffs, residuals + + nanmin = _create_bottleneck_method("nanmin") nanmax = _create_bottleneck_method("nanmax") nanmean = _create_bottleneck_method("nanmean") diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index b9b719e8af9..c41330c4ba8 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4152,6 +4152,37 @@ def test_rank(self): y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=("z",)) assert_equal(y.rank("z", pct=True), y) + def test_polyfit(self): + x = np.arange(10) + da = DataArray(np.stack((1. + x + 2. * x**2, 1. + 2. * x + 3. * x**2)), + dims=('d', 'x'), coords={'x': x, 'd': [0, 1]}) + coeffs = da.polyfit('x', 2) + expected = DataArray([[1, 1, 2], [1, 2, 3]], dims=('d', 'degree'), + coords={'degree': [0, 1, 2], 'd': [0, 1]}).T + assert_allclose(coeffs, expected) + + # With NaN + da[0, 1] = np.nan + coeffs = da.polyfit('x', 2) + expected_na = DataArray([[np.nan, np.nan, np.nan], [1, 2, 3]], dims=('d', 'degree'), + coords={'degree': [0, 1, 2], 'd': [0, 1]}).T + assert_allclose(coeffs, expected_na) + + # Skipna + Full output + coeffs, resids, rank, sing, rcond = da.polyfit('x', 2, skipna=True, full=True) + assert_allclose(coeffs, expected) + assert rank == np.linalg.matrix_rank(np.vander(x, 3)) + np.testing.assert_almost_equal(resids, [0, 0]) + + +def test_polyval(): + x = np.arange(10) + da = DataArray(np.stack((1. + x + 2. * x**2, 1. + 2. * x + 3. 
* x**2)), + dims=('d', 'x'), coords={'x': x, 'd': [0, 1]}) + coeffs = da.polyfit('x', 2) + da_pv = xr.core.dataarray.polyval(da.x, coeffs) + assert_allclose(da, da_pv) + @pytest.fixture(params=[1]) def da(request): From 05355af618ccfc68c061c20ec3318e974f71e497 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 30 Jan 2020 11:49:45 -0500 Subject: [PATCH 02/20] Formatting with black, flake8 --- xarray/core/dask_array_ops.py | 11 +++- xarray/core/dataarray.py | 92 ++++++++++++++++++++++------------ xarray/core/nputils.py | 4 +- xarray/tests/test_dataarray.py | 36 ++++++++----- 4 files changed, 97 insertions(+), 46 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 811483c1d70..961de63f664 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -99,9 +99,18 @@ def func(x, window, axis=-1): def least_squares(lhs, rhs, rcond=None, skipna=False): import dask.array as da + lhs_da = da.from_array(lhs, chunks=(rhs.chunks[0], lhs.shape[1])) if skipna: - results = da.apply_along_axis(nputils._nanpolyfit_1d, 0, rhs.data, lhs_da.data, dtype=float, shape=(lhs.shape[1] + 1,), rcond=rcond) + results = da.apply_along_axis( + nputils._nanpolyfit_1d, + 0, + rhs.data, + lhs_da.data, + dtype=float, + shape=(lhs.shape[1] + 1,), + rcond=rcond, + ) coeffs = results[:-1, ...] residuals = results[-1, ...] else: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 02a4df59794..b59d87f5f9e 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3238,20 +3238,20 @@ def map_blocks( def polyfit( self, - dim : Hashable, - deg : int, - rcond : float = None, - w : Union[Hashable, Any] = None, - full : bool = False, - cov : bool = False, - skipna : bool = False + dim: Hashable, + deg: int, + rcond: float = None, + w: Union[Hashable, Any] = None, + full: bool = False, + cov: bool = False, + skipna: bool = False, ): x = get_clean_interp_index(self, dim) order = int(deg) + 1 lhs = np.vander(x, order, increasing=True) dims_to_stack = [dimname for dimname in self.dims if dimname != dim] - stacked_dim = utils.get_temp_dimname(dims_to_stack, 'stacked') + stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") rhs = self.transpose(dim, *dims_to_stack).stack({stacked_dim: dims_to_stack}) if rcond is None: @@ -3273,38 +3273,64 @@ def polyfit( scale = np.sqrt((lhs * lhs).sum(axis=0)) lhs /= scale - coeffs, residuals = duck_array_ops.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) + coeffs, residuals = duck_array_ops.least_squares( + lhs, rhs, rcond=rcond, skipna=skipna + ) + + if self.name: + name = "{}_".format(self.name) + else: + name = "" - degree_dim = utils.get_temp_dimname(dims_to_stack, 'degree') - coeffs = DataArray(coeffs / scale[:, np.newaxis], dims=(degree_dim, stacked_dim), - coords={degree_dim: np.arange(order), - stacked_dim: rhs[stacked_dim]}, - name=(self.name or '') + '_polyfit_coefficients' - ).unstack(stacked_dim) + degree_dim = utils.get_temp_dimname(dims_to_stack, "degree") + coeffs = DataArray( + coeffs / scale[:, np.newaxis], + dims=(degree_dim, stacked_dim), + coords={degree_dim: np.arange(order), stacked_dim: rhs[stacked_dim]}, + name=name + "_polyfit_coefficients", + ).unstack(stacked_dim) rank = np.linalg.matrix_rank(lhs) if rank != order and not full: - warnings.warn("Polyfit may be poorly conditioned", np.RankWarning, stacklevel=4) + warnings.warn( + "Polyfit may be poorly conditioned", np.RankWarning, stacklevel=4 + ) if full or (cov is True): - residuals = DataArray(residuals, 
dims=(stacked_dim), coords={stacked_dim: rhs[stacked_dim]}, - name=(self.name or '') + '_polyfit_residuals' - ).unstack(stacked_dim) + residuals = DataArray( + residuals, + dims=(stacked_dim), + coords={stacked_dim: rhs[stacked_dim]}, + name=name + "_polyfit_residuals", + ).unstack(stacked_dim) if full: sing = np.linalg.svd(lhs, compute_uv=False) - sing = DataArray(sing, dims=(degree_dim,), coords={degree_dim: np.arange(order)}, - name=(self.name or '') + '_polyfit_singular_values') + sing = DataArray( + sing, + dims=(degree_dim,), + coords={degree_dim: np.arange(order)}, + name=name + "_polyfit_singular_values", + ) return coeffs, residuals, rank, sing, rcond if cov: Vbase = np.linalg.inv(np.dot(lhs.T, lhs)) Vbase /= np.outer(scale, scale) - if cov == 'unscaled': + if cov == "unscaled": fac = 1 else: if x.shape[0] <= order: - raise ValueError("The number of data points must exceed order to scale the covariance matrix.") + raise ValueError( + "The number of data points must exceed order to scale the covariance matrix." + ) fac = residuals / (x.shape[0] - order) - covariance = DataArray(Vbase, dims=('cov_i', 'cov_j'), name=(self.name or '') + '_polyfit_covariance') * fac + covariance = ( + DataArray( + Vbase, + dims=("cov_i", "cov_j"), + name=name + "_polyfit_covariance", + ) + * fac + ) return coeffs, covariance return coeffs @@ -3314,12 +3340,12 @@ def polyfit( def polyval( - coord : Union["DataArray", Any], - coefficients : Union["DataArray", Any], - degree_dim : Union[str, int] = 'degree' -) -> "DataArray" : + coord: Union["DataArray", Any], + coefficients: Union["DataArray", Any], + degree_dim: Union[str, int] = "degree", +) -> "DataArray": if not isinstance(coord, DataArray): - coord = DataArray(coord, dims=('x',), name='x') + coord = DataArray(coord, dims=("x",), name="x") x = get_clean_interp_index(coord, coord.name) if isinstance(degree_dim, str): @@ -3330,9 +3356,11 @@ def polyval( else: deg_coord = np.arange(coefficients.shape[degree_dim]) - lhs = DataArray(np.vander(x, int(deg_coord.max()) + 1, increasing=True), - dims=(coord.name, degree_dim), - coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)}) + lhs = DataArray( + np.vander(x, int(deg_coord.max()) + 1, increasing=True), + dims=(coord.name, degree_dim), + coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)}, + ) return (lhs * coefficients).sum(degree_dim) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 1513f9055dd..201dcd9ffb9 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -233,7 +233,9 @@ def least_squares(lhs, rhs, rcond=None, skipna=False): coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs.data, rcond=rcond) if skipna: nan_cols = np.nonzero(np.isnan(coeffs[0, :]))[0] - out = np.apply_along_axis(_nanpolyfit_1d, 0, rhs.data[:, nan_cols], lhs, rcond=rcond) + out = np.apply_along_axis( + _nanpolyfit_1d, 0, rhs.data[:, nan_cols], lhs, rcond=rcond + ) coeffs[:, nan_cols] = out[:-1, :] residuals[nan_cols] = out[-1, :] return coeffs, residuals diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index c41330c4ba8..09b80bdd4d3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4154,22 +4154,31 @@ def test_rank(self): def test_polyfit(self): x = np.arange(10) - da = DataArray(np.stack((1. + x + 2. * x**2, 1. + 2. * x + 3. 
* x**2)), - dims=('d', 'x'), coords={'x': x, 'd': [0, 1]}) - coeffs = da.polyfit('x', 2) - expected = DataArray([[1, 1, 2], [1, 2, 3]], dims=('d', 'degree'), - coords={'degree': [0, 1, 2], 'd': [0, 1]}).T + da = DataArray( + np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), + dims=("d", "x"), + coords={"x": x, "d": [0, 1]}, + ) + coeffs = da.polyfit("x", 2) + expected = DataArray( + [[1, 1, 2], [1, 2, 3]], + dims=("d", "degree"), + coords={"degree": [0, 1, 2], "d": [0, 1]}, + ).T assert_allclose(coeffs, expected) # With NaN da[0, 1] = np.nan - coeffs = da.polyfit('x', 2) - expected_na = DataArray([[np.nan, np.nan, np.nan], [1, 2, 3]], dims=('d', 'degree'), - coords={'degree': [0, 1, 2], 'd': [0, 1]}).T + coeffs = da.polyfit("x", 2) + expected_na = DataArray( + [[np.nan, np.nan, np.nan], [1, 2, 3]], + dims=("d", "degree"), + coords={"degree": [0, 1, 2], "d": [0, 1]}, + ).T assert_allclose(coeffs, expected_na) # Skipna + Full output - coeffs, resids, rank, sing, rcond = da.polyfit('x', 2, skipna=True, full=True) + coeffs, resids, rank, sing, rcond = da.polyfit("x", 2, skipna=True, full=True) assert_allclose(coeffs, expected) assert rank == np.linalg.matrix_rank(np.vander(x, 3)) np.testing.assert_almost_equal(resids, [0, 0]) @@ -4177,9 +4186,12 @@ def test_polyfit(self): def test_polyval(): x = np.arange(10) - da = DataArray(np.stack((1. + x + 2. * x**2, 1. + 2. * x + 3. * x**2)), - dims=('d', 'x'), coords={'x': x, 'd': [0, 1]}) - coeffs = da.polyfit('x', 2) + da = DataArray( + np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), + dims=("d", "x"), + coords={"x": x, "d": [0, 1]}, + ) + coeffs = da.polyfit("x", 2) da_pv = xr.core.dataarray.polyval(da.x, coeffs) assert_allclose(da, da_pv) From 7f26cb17df807f5b9c8e851903a9592e37be7435 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 30 Jan 2020 12:43:37 -0500 Subject: [PATCH 03/20] Fix failing test --- xarray/tests/test_dataarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 09b80bdd4d3..f7863083cb7 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4193,7 +4193,7 @@ def test_polyval(): ) coeffs = da.polyfit("x", 2) da_pv = xr.core.dataarray.polyval(da.x, coeffs) - assert_allclose(da, da_pv) + assert_allclose(da, da_pv.T) @pytest.fixture(params=[1]) From fdc181985a47de84d6ddc9d738bd893863210029 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 30 Jan 2020 14:27:46 -0500 Subject: [PATCH 04/20] More intelligent skipna switching --- xarray/core/dataarray.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index b59d87f5f9e..41fa27615fe 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -31,6 +31,7 @@ indexing, ops, pdcompat, + pycompat, resample, rolling, utils, @@ -3244,8 +3245,14 @@ def polyfit( w: Union[Hashable, Any] = None, full: bool = False, cov: bool = False, - skipna: bool = False, + skipna: bool = None, ): + if skipna is None: + if isinstance(self.data, pycompat.dask_array_type): + skipna = True + else: + skipna = np.any(self.isnull()) + x = get_clean_interp_index(self, dim) order = int(deg) + 1 lhs = np.vander(x, order, increasing=True) @@ -3325,9 +3332,7 @@ def polyfit( fac = residuals / (x.shape[0] - order) covariance = ( DataArray( - Vbase, - dims=("cov_i", "cov_j"), - name=name + "_polyfit_covariance", + Vbase, dims=("cov_i", "cov_j"), name=name + 
"_polyfit_covariance", ) * fac ) From 924f4ebdf5fb8e6af33d6d512b421c4e15302d8f Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 31 Jan 2020 12:20:17 -0500 Subject: [PATCH 05/20] Add docs | Change coeff order to fit numpy | move polyval --- xarray/__init__.py | 3 +- xarray/core/computation.py | 46 ++++++++++++++++++++++++++++++ xarray/core/dataarray.py | 48 ++++++++++++++------------------ xarray/tests/test_computation.py | 12 ++++++++ xarray/tests/test_dataarray.py | 27 ++++-------------- 5 files changed, 86 insertions(+), 50 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 44dc66411c4..28bb901b251 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -15,7 +15,7 @@ from .core.alignment import align, broadcast from .core.combine import auto_combine, combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, dot, where +from .core.computation import apply_ufunc, dot, where, polyval from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset @@ -65,6 +65,7 @@ "open_mfdataset", "open_rasterio", "open_zarr", + "polyval", "register_dataarray_accessor", "register_dataset_accessor", "save_mfdataset", diff --git a/xarray/core/computation.py b/xarray/core/computation.py index d2c5c32bc00..d27b1562857 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1299,3 +1299,49 @@ def where(cond, x, y): dataset_join="exact", dask="allowed", ) + + +def polyval(coord, coeffs, degree_dim="degree"): + """Evaluate a polynomial at specific values + + Parameters + ---------- + coord : DataArray or array-like + The 1D coordinate along which to evaluate the polynomial. + coeffs : DataArray or array-like + Coefficients of the polynomials. + degree_dim : str or int, optional + Name of dimension or axis number along which the degree increases in `coeffs`. + + See also + -------- + `xarray.DataArray.polyfit` + `numpy.polyval` + """ + from .dataarray import DataArray + from .missing import get_clean_interp_index + + if not isinstance(coord, DataArray): + coord = DataArray(coord, dims=("x",), name="x") + x = get_clean_interp_index(coord, coord.name) + + if isinstance(degree_dim, str): + deg_coord = coeffs[degree_dim] + else: # Axis number + degree_axis = degree_dim + if isinstance(coeffs, DataArray): + degree_dim = coeffs.dims[degree_axis] + deg_coord = coeffs[degree_dim] + else: + degree_dim = "degree" + deg_coord = np.arange(coeffs.shape[degree_axis])[::-1] + coeffs = DataArray(coeffs) + coeffs.coords[coeffs.dims[degree_axis]] = deg_coord + coeffs = coeffs.rename({coeffs.dims[degree_axis]: degree_dim}) + + lhs = DataArray( + np.vander(x, int(deg_coord.max()) + 1), + dims=(coord.name, degree_dim), + coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)[::-1]}, + ) + return (lhs * coeffs).sum(degree_dim) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 41fa27615fe..6e7c86c05fd 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3241,12 +3241,28 @@ def polyfit( self, dim: Hashable, deg: int, + skipna: bool = None, rcond: float = None, w: Union[Hashable, Any] = None, full: bool = False, cov: bool = False, - skipna: bool = None, ): + """Least squares polynomial fit. + + This replicates the behaviour of `numpy.polyfit` but differs by skipping + invalid values when `skipna = True`. + + Parameters + ---------- + dim : hashable + Coordinate along which to fit the polynomials. 
+ skipna : bool, optional + If True, removes all invalid values before fitting each 1D slices of the array + Defaults is True if data is stored in as dask.array or if there is any + invalid values, False otherwise. + + Documentation of `numpy.polyfit` follows: + """ if skipna is None: if isinstance(self.data, pycompat.dask_array_type): skipna = True @@ -3255,7 +3271,7 @@ def polyfit( x = get_clean_interp_index(self, dim) order = int(deg) + 1 - lhs = np.vander(x, order, increasing=True) + lhs = np.vander(x, order) dims_to_stack = [dimname for dimname in self.dims if dimname != dim] stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") @@ -3293,7 +3309,7 @@ def polyfit( coeffs = DataArray( coeffs / scale[:, np.newaxis], dims=(degree_dim, stacked_dim), - coords={degree_dim: np.arange(order), stacked_dim: rhs[stacked_dim]}, + coords={degree_dim: np.arange(order)[::-1], stacked_dim: rhs[stacked_dim]}, name=name + "_polyfit_coefficients", ).unstack(stacked_dim) @@ -3315,7 +3331,7 @@ def polyfit( sing = DataArray( sing, dims=(degree_dim,), - coords={degree_dim: np.arange(order)}, + coords={degree_dim: np.arange(order)[::-1]}, name=name + "_polyfit_singular_values", ) return coeffs, residuals, rank, sing, rcond @@ -3344,29 +3360,7 @@ def polyfit( str = property(StringAccessor) -def polyval( - coord: Union["DataArray", Any], - coefficients: Union["DataArray", Any], - degree_dim: Union[str, int] = "degree", -) -> "DataArray": - if not isinstance(coord, DataArray): - coord = DataArray(coord, dims=("x",), name="x") - x = get_clean_interp_index(coord, coord.name) - - if isinstance(degree_dim, str): - deg_coord = coefficients[degree_dim] - else: # Axis number - if isinstance(coefficients, DataArray): - deg_coord = coefficients[coefficients.dims[degree_dim]] - else: - deg_coord = np.arange(coefficients.shape[degree_dim]) - - lhs = DataArray( - np.vander(x, int(deg_coord.max()) + 1, increasing=True), - dims=(coord.name, degree_dim), - coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)}, - ) - return (lhs * coefficients).sum(degree_dim) +DataArray.polyfit.__doc__ += np.polyfit.__doc__ # priority most be higher than Variable to properly work with binary ufuncs diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 369903552ad..f41b87db594 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1120,3 +1120,15 @@ def test_where(): actual = xr.where(cond, 1, 0) expected = xr.DataArray([1, 0], dims="x") assert_identical(expected, actual) + + +def test_polyval(): + x = np.arange(10) + da = xr.DataArray( + np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), + dims=("d", "x"), + coords={"x": x, "d": [0, 1]}, + ) + coeffs = da.polyfit("x", 2) + da_pv = xr.polyval(da.x, coeffs) + xr.testing.assert_allclose(da, da_pv.T) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index f7863083cb7..755d57d6216 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4155,27 +4155,22 @@ def test_rank(self): def test_polyfit(self): x = np.arange(10) da = DataArray( - np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), + np.stack((1.0 + 1.0 * x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), dims=("d", "x"), coords={"x": x, "d": [0, 1]}, ) coeffs = da.polyfit("x", 2) expected = DataArray( - [[1, 1, 2], [1, 2, 3]], + [[2, 1, 1], [3, 2, 1]], dims=("d", "degree"), - coords={"degree": [0, 1, 2], "d": [0, 1]}, + coords={"degree": [2, 1, 0], "d": [0, 1]}, ).T 
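+        # Coefficients are returned with the highest degree first, matching numpy.polyfit.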
assert_allclose(coeffs, expected) # With NaN da[0, 1] = np.nan - coeffs = da.polyfit("x", 2) - expected_na = DataArray( - [[np.nan, np.nan, np.nan], [1, 2, 3]], - dims=("d", "degree"), - coords={"degree": [0, 1, 2], "d": [0, 1]}, - ).T - assert_allclose(coeffs, expected_na) + coeffs = da.polyfit("x", 2, skipna=True) + assert_allclose(coeffs, expected) # Skipna + Full output coeffs, resids, rank, sing, rcond = da.polyfit("x", 2, skipna=True, full=True) @@ -4184,18 +4179,6 @@ def test_polyfit(self): np.testing.assert_almost_equal(resids, [0, 0]) -def test_polyval(): - x = np.arange(10) - da = DataArray( - np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), - dims=("d", "x"), - coords={"x": x, "d": [0, 1]}, - ) - coeffs = da.polyfit("x", 2) - da_pv = xr.core.dataarray.polyval(da.x, coeffs) - assert_allclose(da, da_pv.T) - - @pytest.fixture(params=[1]) def da(request): if request.param == 1: From 1f6030a09a113f8c119d1593797a74ce62561efd Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 31 Jan 2020 13:16:39 -0500 Subject: [PATCH 06/20] Move doc patching to class --- xarray/__init__.py | 2 +- xarray/core/dataarray.py | 10 +++++----- xarray/core/dataset.py | 2 +- xarray/core/missing.py | 2 +- xarray/core/nputils.py | 1 - xarray/tests/test_duck_array_ops.py | 10 +++++----- xarray/tests/test_missing.py | 5 ++--- 7 files changed, 15 insertions(+), 17 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 28bb901b251..0f6a759e356 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -15,7 +15,7 @@ from .core.alignment import align, broadcast from .core.combine import auto_combine, combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, dot, where, polyval +from .core.computation import apply_ufunc, dot, polyval, where from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 6e7c86c05fd..0938b930ea9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1,3 +1,4 @@ +import datetime import functools import warnings from numbers import Number @@ -18,7 +19,6 @@ cast, ) -import datetime import numpy as np import pandas as pd @@ -3247,7 +3247,8 @@ def polyfit( full: bool = False, cov: bool = False, ): - """Least squares polynomial fit. + """ + Least squares polynomial fit. This replicates the behaviour of `numpy.polyfit` but differs by skipping invalid values when `skipna = True`. 
@@ -3355,13 +3356,12 @@ def polyfit( return coeffs, covariance return coeffs + polyfit.__doc__ += np.polyfit.__doc__ + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = property(StringAccessor) -DataArray.polyfit.__doc__ += np.polyfit.__doc__ - - # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c72ed6cc7d6..07bea6dac19 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1,4 +1,5 @@ import copy +import datetime import functools import sys import warnings @@ -27,7 +28,6 @@ cast, ) -import datetime import numpy as np import pandas as pd diff --git a/xarray/core/missing.py b/xarray/core/missing.py index b20441e993c..40f010b3514 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -1,8 +1,8 @@ +import datetime as dt import warnings from functools import partial from numbers import Number from typing import Any, Callable, Dict, Hashable, Sequence, Union -import datetime as dt import numpy as np import pandas as pd diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 201dcd9ffb9..394b130e93f 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd - from numpy.core.multiarray import normalize_axis_index try: diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 96c883baa67..497a1348be1 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -1,7 +1,7 @@ +import datetime as dt import warnings from textwrap import dedent -import datetime as dt import numpy as np import pandas as pd import pytest @@ -17,13 +17,13 @@ gradient, last, mean, - rolling_window, - stack, - where, - py_timedelta_to_float, np_timedelta64_to_float, pd_timedelta_to_float, + py_timedelta_to_float, + rolling_window, + stack, timedelta_to_numeric, + where, ) from xarray.core.pycompat import dask_array_type from xarray.testing import assert_allclose, assert_equal diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 8d70d9a0fcc..35c71c2854c 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -14,16 +14,15 @@ ) from xarray.core.pycompat import dask_array_type from xarray.tests import ( + assert_allclose, assert_array_equal, assert_equal, - assert_allclose, raises_regex, requires_bottleneck, + requires_cftime, requires_dask, requires_scipy, - requires_cftime, ) - from xarray.tests.test_cftime_offsets import _CFTIME_CALENDARS From 0f922e7afbfc628ee0c28e4c1678aefd7c59a02c Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 31 Jan 2020 15:58:59 -0500 Subject: [PATCH 07/20] conditional doc patching --- xarray/core/dataarray.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 0938b930ea9..8ba3c5536ea 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3356,7 +3356,8 @@ def polyfit( return coeffs, covariance return coeffs - polyfit.__doc__ += np.polyfit.__doc__ + if polyfit.__doc__ is not None: + polyfit.__doc__ += np.polyfit.__doc__ # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names From bfec9c8b58258cbbfdc0f7739ee7cf3c28ed858a Mon Sep 17 
00:00:00 2001 From: Pascal Bourgault Date: Fri, 31 Jan 2020 16:48:29 -0500 Subject: [PATCH 08/20] Fix windows fail - more efficient nan skipping --- xarray/core/nputils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 394b130e93f..b7e9fa3c9b0 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -229,14 +229,19 @@ def _nanpolyfit_1d(arr, x, rcond=None): def least_squares(lhs, rhs, rcond=None, skipna=False): - coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs.data, rcond=rcond) if skipna: - nan_cols = np.nonzero(np.isnan(coeffs[0, :]))[0] - out = np.apply_along_axis( - _nanpolyfit_1d, 0, rhs.data[:, nan_cols], lhs, rcond=rcond + nan_cols = np.any(np.isnan(rhs.data), axis=0) + out = np.empty((lhs.shape + 1, rhs.data.shape[1])) + out[:, nan_cols] = np.apply_along_axis( + _nanpolyfit_1d, 0, rhs.data[:, nan_cols], lhs ) - coeffs[:, nan_cols] = out[:-1, :] - residuals[nan_cols] = out[-1, :] + out[:-1, ~nan_cols], out[-1, ~nan_cols], _, _ = np.linalg.lstsq( + lhs, rhs.data[:, ~nan_cols], rcond=rcond + ) + coeffs = out[:-1, :] + residuals = out[-1, :] + else: + coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs.data, rcond=rcond) return coeffs, residuals From 2d3235a84a034d5106b7bd7f020896a18b575af8 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 31 Jan 2020 17:31:57 -0500 Subject: [PATCH 09/20] Fix typo in least_squares --- xarray/core/nputils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index b7e9fa3c9b0..a069d6d9a18 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -231,7 +231,7 @@ def _nanpolyfit_1d(arr, x, rcond=None): def least_squares(lhs, rhs, rcond=None, skipna=False): if skipna: nan_cols = np.any(np.isnan(rhs.data), axis=0) - out = np.empty((lhs.shape + 1, rhs.data.shape[1])) + out = np.empty((lhs.shape[1] + 1, rhs.data.shape[1])) out[:, nan_cols] = np.apply_along_axis( _nanpolyfit_1d, 0, rhs.data[:, nan_cols], lhs ) From 31440aec91afe47c32c9af266ae4751b17d51f05 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 11 Feb 2020 17:53:10 -0500 Subject: [PATCH 10/20] Move polyfit to dataset --- xarray/core/dataarray.py | 106 ++-------------------- xarray/core/dataset.py | 150 +++++++++++++++++++++++++++++++ xarray/tests/test_computation.py | 2 +- xarray/tests/test_dataarray.py | 16 ++-- 4 files changed, 168 insertions(+), 106 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8ba3c5536ea..e2d698d61fa 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -26,12 +26,10 @@ from . import ( computation, dtypes, - duck_array_ops, groupby, indexing, ops, pdcompat, - pycompat, resample, rolling, utils, @@ -56,7 +54,6 @@ from .indexes import Indexes, default_indexes, propagate_indexes from .indexing import is_fancy_indexer from .merge import PANDAS_TYPES, _extract_indexes_from_coords -from .missing import get_clean_interp_index from .options import OPTIONS from .utils import Default, ReprObject, _check_inplace, _default, either_dict_or_kwargs from .variable import ( @@ -3262,103 +3259,18 @@ def polyfit( Defaults is True if data is stored in as dask.array or if there is any invalid values, False otherwise. - Documentation of `numpy.polyfit` follows: + See documentation of `numpy.polyfit`. + + Returns + ------- + A single dataset with the coefficients of the best fit for this DataArray. 
+ The residuals, singular values and rank are included if `full` is True. + The covariance matrix is included if `full` is False and `cov` is True. """ - if skipna is None: - if isinstance(self.data, pycompat.dask_array_type): - skipna = True - else: - skipna = np.any(self.isnull()) - - x = get_clean_interp_index(self, dim) - order = int(deg) + 1 - lhs = np.vander(x, order) - - dims_to_stack = [dimname for dimname in self.dims if dimname != dim] - stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") - rhs = self.transpose(dim, *dims_to_stack).stack({stacked_dim: dims_to_stack}) - - if rcond is None: - rcond = x.shape[0] * np.core.finfo(x.dtype).eps - - # Weights: - if w is not None: - if isinstance(w, Hashable): - w = self[w] - w = np.asarray(w) - if w.ndim != 1: - raise TypeError("Expected a 1-d array for weights.") - if w.shape[0] != lhs.shape[0]: - raise TypeError("Expected w and {} to have the same length".format(dim)) - lhs *= w[:, np.newaxis] - rhs *= w[:, np.newaxis] - - # Scaling - scale = np.sqrt((lhs * lhs).sum(axis=0)) - lhs /= scale - - coeffs, residuals = duck_array_ops.least_squares( - lhs, rhs, rcond=rcond, skipna=skipna + return self._to_temp_dataset().polyfit( + dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov ) - if self.name: - name = "{}_".format(self.name) - else: - name = "" - - degree_dim = utils.get_temp_dimname(dims_to_stack, "degree") - coeffs = DataArray( - coeffs / scale[:, np.newaxis], - dims=(degree_dim, stacked_dim), - coords={degree_dim: np.arange(order)[::-1], stacked_dim: rhs[stacked_dim]}, - name=name + "_polyfit_coefficients", - ).unstack(stacked_dim) - - rank = np.linalg.matrix_rank(lhs) - if rank != order and not full: - warnings.warn( - "Polyfit may be poorly conditioned", np.RankWarning, stacklevel=4 - ) - - if full or (cov is True): - residuals = DataArray( - residuals, - dims=(stacked_dim), - coords={stacked_dim: rhs[stacked_dim]}, - name=name + "_polyfit_residuals", - ).unstack(stacked_dim) - if full: - sing = np.linalg.svd(lhs, compute_uv=False) - sing = DataArray( - sing, - dims=(degree_dim,), - coords={degree_dim: np.arange(order)[::-1]}, - name=name + "_polyfit_singular_values", - ) - return coeffs, residuals, rank, sing, rcond - if cov: - Vbase = np.linalg.inv(np.dot(lhs.T, lhs)) - Vbase /= np.outer(scale, scale) - if cov == "unscaled": - fac = 1 - else: - if x.shape[0] <= order: - raise ValueError( - "The number of data points must exceed order to scale the covariance matrix." 
- ) - fac = residuals / (x.shape[0] - order) - covariance = ( - DataArray( - Vbase, dims=("cov_i", "cov_j"), name=name + "_polyfit_covariance", - ) - * fac - ) - return coeffs, covariance - return coeffs - - if polyfit.__doc__ is not None: - polyfit.__doc__ += np.polyfit.__doc__ - # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = property(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 07bea6dac19..0d56fb0ed90 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -75,6 +75,7 @@ merge_coordinates_without_align, merge_data_and_coords, ) +from .missing import get_clean_interp_index from .options import OPTIONS, _get_keep_attrs from .pycompat import dask_array_type from .utils import ( @@ -5690,5 +5691,154 @@ def map_blocks( return map_blocks(func, self, args, kwargs) + def polyfit( + self, + dim: Hashable, + deg: int, + skipna: bool = None, + rcond: float = None, + w: Union[Hashable, Any] = None, + full: bool = False, + cov: bool = False, + ): + """ + Least squares polynomial fit. + + This replicates the behaviour of `numpy.polyfit` but differs by skipping + invalid values when `skipna = True`. + + Parameters + ---------- + dim : hashable + Coordinate along which to fit the polynomials. + skipna : bool, optional + If True, removes all invalid values before fitting each 1D slices of the array + Default is True if data is stored in as dask.array or if there is any + invalid values, False otherwise. + + See documentation of `numpy.polyfit`. + + Returns + ------- + A single dataset with the coefficients of the best fit for each variable in this dataset. + The residuals, singular values and rank are included if `full` is True. + The covariance matrices are included if `full` is False and `cov` is True. 
+ """ + variables = {} + skipna_da = skipna + + x = get_clean_interp_index(self, dim) + xname = "{}_".format(self[dim].name) + order = int(deg) + 1 + lhs = np.vander(x, order) + + if rcond is None: + rcond = x.shape[0] * np.core.finfo(x.dtype).eps + + # Weights: + if w is not None: + if isinstance(w, Hashable): + w = self.coords[w] + w = np.asarray(w) + if w.ndim != 1: + raise TypeError("Expected a 1-d array for weights.") + if w.shape[0] != lhs.shape[0]: + raise TypeError("Expected w and {} to have the same length".format(dim)) + lhs *= w[:, np.newaxis] + + # Scaling + scale = np.sqrt((lhs * lhs).sum(axis=0)) + lhs /= scale + + degree_dim = utils.get_temp_dimname(self.dims, "degree") + + rank = np.linalg.matrix_rank(lhs) + if rank != order and not full: + warnings.warn( + "Polyfit may be poorly conditioned", np.RankWarning, stacklevel=4 + ) + + if full: + rank = xr.DataArray(rank, name=xname + "matrix_rank") + variables[rank.name] = rank + sing = np.linalg.svd(lhs, compute_uv=False) + sing = xr.DataArray( + sing, + dims=(degree_dim,), + coords={degree_dim: np.arange(order)[::-1]}, + name=xname + "singular_values", + ) + variables[sing.name] = sing + + for name, da in self.data_vars.items(): + if dim not in da.coords: + continue + + if skipna is None: + if isinstance(da.data, dask_array_type): + skipna_da = True + else: + skipna_da = np.any(da.isnull()) + + dims_to_stack = [dimname for dimname in da.dims if dimname != dim] + stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") + rhs = da.transpose(dim, *dims_to_stack).stack({stacked_dim: dims_to_stack}) + + if w is not None: + rhs *= w[:, np.newaxis] + + coeffs, residuals = duck_array_ops.least_squares( + lhs, rhs, rcond=rcond, skipna=skipna_da + ) + + if isinstance(name, str): + name = "{}_".format(name) + else: + # Thus a ReprObject => polyfit was called on a DataArray + name = "" + + coeffs = xr.DataArray( + coeffs / scale[:, np.newaxis], + dims=(degree_dim, stacked_dim), + coords={ + degree_dim: np.arange(order)[::-1], + stacked_dim: rhs[stacked_dim], + }, + name=name + "polyfit_coefficients", + ).unstack(stacked_dim) + variables[coeffs.name] = coeffs + + if full or (cov is True): + residuals = xr.DataArray( + residuals, + dims=(stacked_dim), + coords={stacked_dim: rhs[stacked_dim]}, + name=name + "polyfit_residuals", + ).unstack(stacked_dim) + variables[residuals.name] = residuals + + if cov: + Vbase = np.linalg.inv(np.dot(lhs.T, lhs)) + Vbase /= np.outer(scale, scale) + if cov == "unscaled": + fac = 1 + else: + if x.shape[0] <= order: + raise ValueError( + "The number of data points must exceed order to scale the covariance matrix." 
+ ) + fac = residuals / (x.shape[0] - order) + covariance = ( + xr.DataArray( + Vbase, + dims=("cov_i", "cov_j"), + name=name + "polyfit_covariance", + ) + * fac + ) + variables[covariance.name] = covariance + + return Dataset(data_vars=variables, attrs=self.attrs.copy()) + ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index f41b87db594..8d9e72db8bb 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1130,5 +1130,5 @@ def test_polyval(): coords={"x": x, "d": [0, 1]}, ) coeffs = da.polyfit("x", 2) - da_pv = xr.polyval(da.x, coeffs) + da_pv = xr.polyval(da.x, coeffs.polyfit_coefficients) xr.testing.assert_allclose(da, da_pv.T) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 755d57d6216..5b50d1d9adb 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4159,24 +4159,24 @@ def test_polyfit(self): dims=("d", "x"), coords={"x": x, "d": [0, 1]}, ) - coeffs = da.polyfit("x", 2) + out = da.polyfit("x", 2) expected = DataArray( [[2, 1, 1], [3, 2, 1]], dims=("d", "degree"), coords={"degree": [2, 1, 0], "d": [0, 1]}, ).T - assert_allclose(coeffs, expected) + assert_allclose(out.polyfit_coefficients, expected) # With NaN da[0, 1] = np.nan - coeffs = da.polyfit("x", 2, skipna=True) - assert_allclose(coeffs, expected) + out = da.polyfit("x", 2, skipna=True) + assert_allclose(out.polyfit_coefficients, expected) # Skipna + Full output - coeffs, resids, rank, sing, rcond = da.polyfit("x", 2, skipna=True, full=True) - assert_allclose(coeffs, expected) - assert rank == np.linalg.matrix_rank(np.vander(x, 3)) - np.testing.assert_almost_equal(resids, [0, 0]) + out = da.polyfit("x", 2, skipna=True, full=True) + assert_allclose(out.polyfit_coefficients, expected) + assert out.x_matrix_rank == np.linalg.matrix_rank(np.vander(x, 3)) + np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0]) @pytest.fixture(params=[1]) From da831a5de27a35b15ac2d144bc4dc557036742a9 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Wed, 12 Feb 2020 12:34:53 -0500 Subject: [PATCH 11/20] Add more tests | fix some edge cases --- xarray/core/dask_array_ops.py | 12 ++++++-- xarray/core/dataset.py | 15 +++------- xarray/core/nputils.py | 26 +++++++++++------ xarray/tests/test_computation.py | 31 ++++++++++++++++---- xarray/tests/test_dataarray.py | 45 +++++++++++++++++++++-------- xarray/tests/test_dataset.py | 13 +++++++++ xarray/tests/test_duck_array_ops.py | 18 ++++++++++++ 7 files changed, 120 insertions(+), 40 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 961de63f664..87f646352eb 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -102,17 +102,23 @@ def least_squares(lhs, rhs, rcond=None, skipna=False): lhs_da = da.from_array(lhs, chunks=(rhs.chunks[0], lhs.shape[1])) if skipna: + added_dim = rhs.ndim == 1 + if added_dim: + rhs = rhs.reshape(rhs.shape[0], 1) results = da.apply_along_axis( nputils._nanpolyfit_1d, 0, - rhs.data, - lhs_da.data, + rhs, + lhs_da, dtype=float, shape=(lhs.shape[1] + 1,), rcond=rcond, ) coeffs = results[:-1, ...] residuals = results[-1, ...] 
+ if added_dim: + coeffs = coeffs.reshape(coeffs.shape[0]) + residuals = residuals.reshape(residuals.shape[0]) else: - coeffs, residuals, _, _ = da.linalg.lstsq(lhs_da.data, rhs.data) + coeffs, residuals, _, _ = da.linalg.lstsq(lhs_da, rhs) return coeffs, residuals diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0d56fb0ed90..4ebd2fc6f0f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5771,7 +5771,7 @@ def polyfit( variables[sing.name] = sing for name, da in self.data_vars.items(): - if dim not in da.coords: + if dim not in da.dims: continue if skipna is None: @@ -5788,7 +5788,7 @@ def polyfit( rhs *= w[:, np.newaxis] coeffs, residuals = duck_array_ops.least_squares( - lhs, rhs, rcond=rcond, skipna=skipna_da + lhs, rhs.data, rcond=rcond, skipna=skipna_da ) if isinstance(name, str): @@ -5828,15 +5828,8 @@ def polyfit( "The number of data points must exceed order to scale the covariance matrix." ) fac = residuals / (x.shape[0] - order) - covariance = ( - xr.DataArray( - Vbase, - dims=("cov_i", "cov_j"), - name=name + "polyfit_covariance", - ) - * fac - ) - variables[covariance.name] = covariance + covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j"),) * fac + variables[name + "polyfit_covariance"] = covariance return Dataset(data_vars=variables, attrs=self.attrs.copy()) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index a069d6d9a18..b2cd0ba9b72 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -230,18 +230,26 @@ def _nanpolyfit_1d(arr, x, rcond=None): def least_squares(lhs, rhs, rcond=None, skipna=False): if skipna: - nan_cols = np.any(np.isnan(rhs.data), axis=0) - out = np.empty((lhs.shape[1] + 1, rhs.data.shape[1])) - out[:, nan_cols] = np.apply_along_axis( - _nanpolyfit_1d, 0, rhs.data[:, nan_cols], lhs - ) - out[:-1, ~nan_cols], out[-1, ~nan_cols], _, _ = np.linalg.lstsq( - lhs, rhs.data[:, ~nan_cols], rcond=rcond - ) + added_dim = rhs.ndim == 1 + if added_dim: + rhs = rhs.reshape(rhs.shape[0], 1) + nan_cols = np.any(np.isnan(rhs), axis=0) + out = np.empty((lhs.shape[1] + 1, rhs.shape[1])) + if np.any(nan_cols): + out[:, nan_cols] = np.apply_along_axis( + _nanpolyfit_1d, 0, rhs[:, nan_cols], lhs + ) + if np.any(~nan_cols): + out[:-1, ~nan_cols], out[-1, ~nan_cols], _, _ = np.linalg.lstsq( + lhs, rhs[:, ~nan_cols], rcond=rcond + ) coeffs = out[:-1, :] residuals = out[-1, :] + if added_dim: + coeffs = coeffs.reshape(coeffs.shape[0]) + residuals = residuals.reshape(residuals.shape[0]) else: - coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs.data, rcond=rcond) + coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs, rcond=rcond) return coeffs, residuals diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 8d9e72db8bb..01cd4d526fb 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1122,13 +1122,34 @@ def test_where(): assert_identical(expected, actual) -def test_polyval(): - x = np.arange(10) +@pytest.mark.parametrize("use_dask", [True, False]) +@pytest.mark.parametrize("use_datetime", [True, False]) +@pytest.mark.parametrize("provide_coord", [True, False]) +def test_polyval(use_dask, use_datetime, provide_coord): + if use_datetime: + xcoord = xr.DataArray( + pd.date_range("2000-01-01", freq="D", periods=10), dims=("x",), name="x" + ) + x = xr.core.missing.get_clean_interp_index(xcoord, "x") + else: + xcoord = x = np.arange(10) + da = xr.DataArray( np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), dims=("d", "x"), - coords={"x": x, 
"d": [0, 1]}, + coords={"x": xcoord, "d": [0, 1]}, ) - coeffs = da.polyfit("x", 2) - da_pv = xr.polyval(da.x, coeffs.polyfit_coefficients) + coeffs = xr.DataArray( + [[2, 1, 1], [3, 2, 1]], + dims=("d", "degree"), + coords={"d": [0, 1], "degree": [2, 1, 0]}, + ) + if use_dask: + coeffs = coeffs.chunk({"d": 2}) + if provide_coord: + da_pv = xr.polyval(da.x, coeffs) + else: + da_pv = xr.polyval(da.x, coeffs.values, degree_dim=1) + da_pv = da_pv.rename(dim_0="d") + da_pv["d"] = [0, 1] xr.testing.assert_allclose(da, da_pv.T) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5b50d1d9adb..c707b625ab6 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4152,30 +4152,51 @@ def test_rank(self): y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=("z",)) assert_equal(y.rank("z", pct=True), y) - def test_polyfit(self): - x = np.arange(10) - da = DataArray( - np.stack((1.0 + 1.0 * x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), + @pytest.mark.parametrize("use_dask", [True, False]) + @pytest.mark.parametrize("use_datetime", [True, False]) + def test_polyfit(self, use_dask, use_datetime): + xcoord = xr.DataArray( + pd.date_range("1970-01-01", freq="D", periods=10), dims=("x",), name="x" + ) + x = xr.core.missing.get_clean_interp_index(xcoord, "x") + if not use_datetime: + xcoord = x + + da_raw = DataArray( + np.stack( + (10 + 1e-15 * x + 2e-28 * x ** 2, 30 + 2e-14 * x + 1e-29 * x ** 2) + ), dims=("d", "x"), - coords={"x": x, "d": [0, 1]}, + coords={"x": xcoord, "d": [0, 1]}, ) + + if use_dask: + da = da_raw.chunk({"d": 1}) + else: + da = da_raw + out = da.polyfit("x", 2) expected = DataArray( - [[2, 1, 1], [3, 2, 1]], + [[2e-28, 1e-15, 10], [1e-29, 2e-14, 30]], dims=("d", "degree"), coords={"degree": [2, 1, 0], "d": [0, 1]}, ).T - assert_allclose(out.polyfit_coefficients, expected) + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) # With NaN - da[0, 1] = np.nan - out = da.polyfit("x", 2, skipna=True) - assert_allclose(out.polyfit_coefficients, expected) + da_raw[0, 1] = np.nan + if use_dask: + da = da_raw.chunk({"d": 1}) + else: + da = da_raw + out = da.polyfit("x", 2, skipna=True, cov=True) + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + assert "polyfit_covariance" in out # Skipna + Full output out = da.polyfit("x", 2, skipna=True, full=True) - assert_allclose(out.polyfit_coefficients, expected) - assert out.x_matrix_rank == np.linalg.matrix_rank(np.vander(x, 3)) + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + assert out.x_matrix_rank == 3 np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0]) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 4e51e229b29..aaa3e8bb645 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5438,6 +5438,19 @@ def test_ipython_key_completion(self): ds.data_vars[item] # should not raise assert sorted(actual) == sorted(expected) + def test_polyfit_output(self): + ds = create_test_data(seed=1) + + out = ds.polyfit("dim2", 2, full=False) + assert "var1_polyfit_coefficients" in out + + out = ds.polyfit("dim1", 2, full=True) + assert "var1_polyfit_coefficients" in out + assert "dim1_matrix_rank" in out + + out = ds.polyfit("time", 2) + assert len(out.data_vars) == 0 + # Py.test tests diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 497a1348be1..9d8a8cbc27f 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -16,6 
+16,7 @@ first, gradient, last, + least_squares, mean, np_timedelta64_to_float, pd_timedelta_to_float, @@ -760,3 +761,20 @@ def test_timedelta_to_numeric(td): out = timedelta_to_numeric(td, "ns") np.testing.assert_allclose(out, 86400 * 1e9) assert isinstance(out, float) + + +@pytest.mark.parametrize("use_dask", [True, False]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_least_squares(use_dask, skipna): + if use_dask and not has_dask: + pytest.skip("requires dask") + lhs = np.array([[1, 2], [1, 2], [3, 2]]) + rhs = DataArray(np.array([3, 5, 7]), dims=("y",)) + + if use_dask: + rhs = rhs.chunk({"y": 1}) + + coeffs, residuals = least_squares(lhs, rhs.data, skipna=skipna) + + np.testing.assert_allclose(coeffs, [1.5, 1.25]) + np.testing.assert_allclose(residuals, [2.0]) From f13123fff2beedb087136e342d8574873a16733d Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Wed, 12 Feb 2020 13:06:01 -0500 Subject: [PATCH 12/20] Skip test without dask --- xarray/tests/test_dataarray.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index c707b625ab6..fc6e999b1fe 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -23,6 +23,7 @@ assert_array_equal, assert_equal, assert_identical, + has_dask, raises_regex, requires_bottleneck, requires_dask, @@ -4155,6 +4156,8 @@ def test_rank(self): @pytest.mark.parametrize("use_dask", [True, False]) @pytest.mark.parametrize("use_datetime", [True, False]) def test_polyfit(self, use_dask, use_datetime): + if use_dask and not has_dask: + pytest.skip("requires dask") xcoord = xr.DataArray( pd.date_range("1970-01-01", freq="D", periods=10), dims=("x",), name="x" ) From 896313fa4d94fd18510be2a8255721cf9815ca2b Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Wed, 12 Feb 2020 13:57:54 -0500 Subject: [PATCH 13/20] Fix 1D case | add docs --- doc/api.rst | 3 +++ doc/computation.rst | 26 ++++++++++++++++++++++++++ doc/whats-new.rst | 2 ++ xarray/core/dataset.py | 36 +++++++++++++++++++++++------------- 4 files changed, 54 insertions(+), 13 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 9735eb0c708..857e12d4227 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -30,6 +30,7 @@ Top-level functions zeros_like ones_like dot + polyval map_blocks show_versions set_options @@ -171,6 +172,7 @@ Computation Dataset.quantile Dataset.differentiate Dataset.integrate + Dataset.polyfit **Aggregation**: :py:attr:`~Dataset.all` @@ -349,6 +351,7 @@ Computation DataArray.quantile DataArray.differentiate DataArray.integrate + DataArray.polyfit DataArray.str **Aggregation**: diff --git a/doc/computation.rst b/doc/computation.rst index 1ac30f55ee7..3c6d90d413e 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -317,6 +317,32 @@ trapezoidal rule using their coordinates, and integration along multidimensional coordinate are not supported. +.. _compute.polyfit: + +Fitting polynomials +=================== + +Xarray objects provide an interface for performing linear or polynomial regressions +using the least-squares method. :py:meth:`~xarray.DataArray.polyfit` computes the +best fitting coefficients along a given dimension and for a given order, + +.. ipython:: python + + x = np.arange(10) + a = xr.DataArray(3 + 4 * x, dims=['x'], coords={'x': x}) + out = a.polyfit('x', 1, full=True) + out + +The method outputs a dataset containing the coefficients (and more if `full = True`). +The inverse operation is done with :py:meth:`~xarray.polyval`, + +.. 
ipython::python + + xr.polyval(x, out.polyfit_coefficients) + +.. note:: + These methods replicate the behaviour of :py:func:`numpy.polyfit` and :py:func:`numpy.polyval`. + .. _compute.broadcasting: Broadcasting by dimension name diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4aad6ad3701..926681c8f6d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -40,6 +40,8 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`) + By `Pascal Bourgault `_. - :py:meth:`DataArray.sel` and :py:meth:`Dataset.sel` now support :py:class:`pandas.CategoricalIndex`. (:issue:`3669`) By `Keisuke Fujii `_. - Support using an existing, opened h5netcdf ``File`` with diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4ebd2fc6f0f..f76ccf41e29 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5781,8 +5781,17 @@ def polyfit( skipna_da = np.any(da.isnull()) dims_to_stack = [dimname for dimname in da.dims if dimname != dim] - stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") - rhs = da.transpose(dim, *dims_to_stack).stack({stacked_dim: dims_to_stack}) + stacked_coords = {} + if dims_to_stack: + stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") + rhs = da.transpose(dim, *dims_to_stack).stack( + {stacked_dim: dims_to_stack} + ) + stacked_coords = {stacked_dim: rhs[stacked_dim]} + scale_da = scale[:, np.newaxis] + else: + rhs = da + scale_da = scale if w is not None: rhs *= w[:, np.newaxis] @@ -5798,23 +5807,24 @@ def polyfit( name = "" coeffs = xr.DataArray( - coeffs / scale[:, np.newaxis], - dims=(degree_dim, stacked_dim), - coords={ - degree_dim: np.arange(order)[::-1], - stacked_dim: rhs[stacked_dim], - }, + coeffs / scale_da, + dims=[degree_dim] + list(stacked_coords.keys()), + coords={degree_dim: np.arange(order)[::-1], **stacked_coords}, name=name + "polyfit_coefficients", - ).unstack(stacked_dim) + ) + if dims_to_stack: + coeffs = coeffs.unstack(stacked_dim) variables[coeffs.name] = coeffs if full or (cov is True): residuals = xr.DataArray( - residuals, - dims=(stacked_dim), - coords={stacked_dim: rhs[stacked_dim]}, + residuals if dims_to_stack else residuals.squeeze(), + dims=list(stacked_coords.keys()), + coords=stacked_coords, name=name + "polyfit_residuals", - ).unstack(stacked_dim) + ) + if dims_to_stack: + residuals = residuals.unstack(stacked_dim) variables[residuals.name] = residuals if cov: From 5b709716c79b03776aae9b3b25a3df0cb6d68f34 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Wed, 12 Feb 2020 14:33:28 -0500 Subject: [PATCH 14/20] skip polyval test without dask --- xarray/tests/test_computation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 01cd4d526fb..f01ebb5759a 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1126,6 +1126,8 @@ def test_where(): @pytest.mark.parametrize("use_datetime", [True, False]) @pytest.mark.parametrize("provide_coord", [True, False]) def test_polyval(use_dask, use_datetime, provide_coord): + if use_dask and not has_dask: + pytest.skip("requires dask") if use_datetime: xcoord = xr.DataArray( pd.date_range("2000-01-01", freq="D", periods=10), dims=("x",), name="x" From 5af3c64190ace1d6216d06473bc672851ddc7858 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 13 Feb 2020 13:55:54 -0500 Subject: [PATCH 15/20] Explicit docs | More restrictive polyval --- doc/computation.rst 
From 5af3c64190ace1d6216d06473bc672851ddc7858 Mon Sep 17 00:00:00 2001
From: Pascal Bourgault
Date: Thu, 13 Feb 2020 13:55:54 -0500
Subject: [PATCH 15/20] Explicit docs | More restrictive polyval

---
 doc/computation.rst              |  4 ++--
 xarray/core/computation.py       | 24 +++++-------------------
 xarray/core/dataarray.py         | 24 ++++++++++++++++++++----
 xarray/core/dataset.py           | 24 ++++++++++++++++++++----
 xarray/tests/test_computation.py | 13 +++++--------
 5 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/doc/computation.rst b/doc/computation.rst
index 3c6d90d413e..84b4d6002a6 100644
--- a/doc/computation.rst
+++ b/doc/computation.rst
@@ -330,7 +330,7 @@ best fitting coefficients along a given dimension and for a given order,

     x = np.arange(10)
     a = xr.DataArray(3 + 4 * x, dims=['x'], coords={'x': x})
-    out = a.polyfit('x', 1, full=True)
+    out = a.polyfit(dim='x', deg=1, full=True)
     out

 The method outputs a dataset containing the coefficients (and more if `full = True`).
@@ -338,7 +338,7 @@ The inverse operation is done with :py:meth:`~xarray.polyval`,

 .. ipython::python

-    xr.polyval(x, out.polyfit_coefficients)
+    xr.polyval(coord=x, coeffs=out.polyfit_coefficients)

 .. note::
     These methods replicate the behaviour of :py:func:`numpy.polyfit` and :py:func:`numpy.polyval`.
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
index d27b1562857..9096b9fa1d4 100644
--- a/xarray/core/computation.py
+++ b/xarray/core/computation.py
@@ -1306,12 +1306,12 @@ def polyval(coord, coeffs, degree_dim="degree"):

     Parameters
     ----------
-    coord : DataArray or array-like
+    coord : DataArray
        The 1D coordinate along which to evaluate the polynomial.
-    coeffs : DataArray or array-like
+    coeffs : DataArray
        Coefficients of the polynomials.
-    degree_dim : str or int, optional
-        Name of dimension or axis number along which the degree increases in `coeffs`.
+    degree_dim : str, optional
+        Name of the polynomial degree dimension in `coeffs`.

     See also
     --------
@@ -1321,23 +1321,9 @@ def polyval(coord, coeffs, degree_dim="degree"):
     from .dataarray import DataArray
     from .missing import get_clean_interp_index

-    if not isinstance(coord, DataArray):
-        coord = DataArray(coord, dims=("x",), name="x")
     x = get_clean_interp_index(coord, coord.name)

-    if isinstance(degree_dim, str):
-        deg_coord = coeffs[degree_dim]
-    else:  # Axis number
-        degree_axis = degree_dim
-        if isinstance(coeffs, DataArray):
-            degree_dim = coeffs.dims[degree_axis]
-            deg_coord = coeffs[degree_dim]
-        else:
-            degree_dim = "degree"
-            deg_coord = np.arange(coeffs.shape[degree_axis])[::-1]
-            coeffs = DataArray(coeffs)
-            coeffs.coords[coeffs.dims[degree_axis]] = deg_coord
-        coeffs = coeffs.rename({coeffs.dims[degree_axis]: degree_dim})
+    deg_coord = coeffs[degree_dim]

     lhs = DataArray(
         np.vander(x, int(deg_coord.max()) + 1),
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index e2d698d61fa..c6a786a9351 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -3254,18 +3254,34 @@ def polyfit(
         ----------
         dim : hashable
             Coordinate along which to fit the polynomials.
+        deg : int
+            Degree of the fitting polynomial.
         skipna : bool, optional
             If True, removes all invalid values before fitting each 1D slices of the array
-            Defaults is True if data is stored in as dask.array or if there is any
+            Default is True if data is stored in as dask.array or if there is any
             invalid values, False otherwise.
+        rcond : float, optional
+            Relative condition number to the fit.
+        w : Union[Hashable, Any], optional
+            Weights to apply to the y-coordinate of the sample points.
+            Can be an array-like object or the name of a coordinate in the dataset.
+        full : bool, optional
+            Whether to return the residuals, matrix rank and singular values in addition
+            to the coefficients.
+        cov : Union[bool, str], optional
+            Whether to return to the covariance matrix in addition to the coefficients.
+            The matrix is not scaled if `cov = 'unscaled'`.

         See documentation of `numpy.polyfit`.

         Returns
         -------
-        A single dataset with the coefficients of the best fit for this DataArray.
-        The residuals, singular values and rank are included if `full` is True.
-        The covariance matrix is included if `full` is False and `cov` is True.
+        A single dataset which containts:
+            polyfit_coefficients : The coefficients of the best fit.
+            polyfit_residuals : The residuals of the least-square computation (only included if `full=True`)
+            [dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            [dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
         """
         return self._to_temp_dataset().polyfit(
             dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index f76ccf41e29..3675c76e705 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -5699,7 +5699,7 @@ def polyfit(
         rcond: float = None,
         w: Union[Hashable, Any] = None,
         full: bool = False,
-        cov: bool = False,
+        cov: Union[bool, str] = False,
     ):
         """
         Least squares polynomial fit.
@@ -5711,18 +5711,34 @@ def polyfit(
         ----------
         dim : hashable
             Coordinate along which to fit the polynomials.
+        deg : int
+            Degree of the fitting polynomial.
         skipna : bool, optional
             If True, removes all invalid values before fitting each 1D slices of the array
             Default is True if data is stored in as dask.array or if there is any
             invalid values, False otherwise.
+        rcond : float, optional
+            Relative condition number to the fit.
+        w : Union[Hashable, Any], optional
+            Weights to apply to the y-coordinate of the sample points.
+            Can be an array-like object or the name of a coordinate in the dataset.
+        full : bool, optional
+            Whether to return the residuals, matrix rank and singular values in addition
+            to the coefficients.
+        cov : Union[bool, str], optional
+            Whether to return to the covariance matrix in addition to the coefficients.
+            The matrix is not scaled if `cov = 'unscaled'`.

         See documentation of `numpy.polyfit`.

         Returns
         -------
-        A single dataset with the coefficients of the best fit for each variable in this dataset.
-        The residuals, singular values and rank are included if `full` is True.
-        The covariance matrices are included if `full` is False and `cov` is True.
+        A single dataset which containts:
+            [var_]polyfit_coefficients : The coefficients of the best fit for each variable in this dataset.
+            [var_]polyfit_residuals : The residuals of the least-square computation for each variable (only included if `full=True`)
+            [dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            [dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            [var_]polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
         """
         variables = {}
         skipna_da = skipna
diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py
index f01ebb5759a..4eed464d2dc 100644
--- a/xarray/tests/test_computation.py
+++ b/xarray/tests/test_computation.py
@@ -1124,10 +1124,10 @@ def test_where():

 @pytest.mark.parametrize("use_dask", [True, False])
 @pytest.mark.parametrize("use_datetime", [True, False])
-@pytest.mark.parametrize("provide_coord", [True, False])
-def test_polyval(use_dask, use_datetime, provide_coord):
+def test_polyval(use_dask, use_datetime):
     if use_dask and not has_dask:
         pytest.skip("requires dask")
+
     if use_datetime:
         xcoord = xr.DataArray(
             pd.date_range("2000-01-01", freq="D", periods=10), dims=("x",), name="x"
@@ -1148,10 +1148,7 @@ def test_polyval(use_dask, use_datetime, provide_coord):
     )
     if use_dask:
         coeffs = coeffs.chunk({"d": 2})
-    if provide_coord:
-        da_pv = xr.polyval(da.x, coeffs)
-    else:
-        da_pv = xr.polyval(da.x, coeffs.values, degree_dim=1)
-        da_pv = da_pv.rename(dim_0="d")
-        da_pv["d"] = [0, 1]
+
+    da_pv = xr.polyval(da.x, coeffs)
+
     xr.testing.assert_allclose(da, da_pv.T)

From 3400dea560267f39d51eedc5159b6728b4d68d71 Mon Sep 17 00:00:00 2001
From: Pascal Bourgault
Date: Fri, 13 Mar 2020 12:39:24 -0400
Subject: [PATCH 16/20] Small typo in polyfit docstrings

---
 xarray/core/dataarray.py | 2 +-
 xarray/core/dataset.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index ec77a666894..8668f7fab89 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -3269,7 +3269,7 @@ def polyfit(
             Degree of the fitting polynomial.
         skipna : bool, optional
             If True, removes all invalid values before fitting each 1D slices of the array
-            Default is True if data is stored in as dask.array or if there is any
+            Default is True if data is stored in a dask.array or if there is any
             invalid values, False otherwise.
         rcond : float, optional
             Relative condition number to the fit.
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index eda08eea08a..168f1755f47 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -5749,7 +5749,7 @@ def polyfit(
             Degree of the fitting polynomial.
         skipna : bool, optional
             If True, removes all invalid values before fitting each 1D slices of the array
-            Default is True if data is stored in as dask.array or if there is any
+            Default is True if data is stored in a dask.array or if there is any
             invalid values, False otherwise.
         rcond : float, optional
             Relative condition number to the fit.
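With [PATCH 15/20] applied, ``xr.polyval`` only accepts DataArray arguments whose coefficients carry the polynomial degree as a named dimension, exactly as ``polyfit`` produces them. A hedged usage sketch (the coefficient values are made up; only the call signature comes from the patch):

    import numpy as np
    import xarray as xr

    x = xr.DataArray(np.arange(10), dims=["x"], name="x")

    # Coefficients with an explicit "degree" coordinate, highest degree first,
    # mirroring polyfit's output layout: this encodes 2*x**2 + 3*x + 5.
    coeffs = xr.DataArray(
        [2.0, 3.0, 5.0], dims=["degree"], coords={"degree": [2, 1, 0]}
    )

    # Passing a bare numpy array for `coeffs`, or an axis number for
    # `degree_dim`, is no longer supported after this change.
    da_pv = xr.polyval(x, coeffs)
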
From 198ff81eaa540f3f0cb119dba6225ee0b9c7d223 Mon Sep 17 00:00:00 2001
From: Pascal Bourgault
Date: Fri, 13 Mar 2020 13:53:30 -0400
Subject: [PATCH 17/20] Apply suggestions from code review

Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>
---
 doc/computation.rst        | 4 ++--
 xarray/core/computation.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/computation.rst b/doc/computation.rst
index 84b4d6002a6..291172edca1 100644
--- a/doc/computation.rst
+++ b/doc/computation.rst
@@ -333,10 +333,10 @@ best fitting coefficients along a given dimension and for a given order,
     out = a.polyfit(dim='x', deg=1, full=True)
     out

-The method outputs a dataset containing the coefficients (and more if `full = True`).
+The method outputs a dataset containing the coefficients (and more if `full=True`).
 The inverse operation is done with :py:meth:`~xarray.polyval`,

-.. ipython::python
+.. ipython:: python

     xr.polyval(coord=x, coeffs=out.polyfit_coefficients)

diff --git a/xarray/core/computation.py b/xarray/core/computation.py
index 9096b9fa1d4..9aefd8e5d5e 100644
--- a/xarray/core/computation.py
+++ b/xarray/core/computation.py
@@ -1310,13 +1310,13 @@ def polyval(coord, coeffs, degree_dim="degree"):
         The 1D coordinate along which to evaluate the polynomial.
     coeffs : DataArray
         Coefficients of the polynomials.
-    degree_dim : str, optional
+    degree_dim : str, default "degree"
         Name of the polynomial degree dimension in `coeffs`.

     See also
     --------
-    `xarray.DataArray.polyfit`
-    `numpy.polyval`
+    xarray.DataArray.polyfit
+    numpy.polyval
     """
     from .dataarray import DataArray
     from .missing import get_clean_interp_index

From 34726a14127107db93dbfa8e165dfa0227043886 Mon Sep 17 00:00:00 2001
From: Pascal Bourgault
Date: Fri, 13 Mar 2020 16:09:51 -0400
Subject: [PATCH 18/20] Polyfit : fix style in docstring | add see also section

---
 xarray/core/dataarray.py | 6 +++++-
 xarray/core/dataset.py   | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 8668f7fab89..aeb16eed46f 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -3281,7 +3281,7 @@ def polyfit(
             to the coefficients.
         cov : Union[bool, str], optional
             Whether to return to the covariance matrix in addition to the coefficients.
-            The matrix is not scaled if `cov = 'unscaled'`.
+            The matrix is not scaled if `cov='unscaled'`.

         See documentation of `numpy.polyfit`.

@@ -3293,6 +3293,10 @@ def polyfit(
             [dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
             [dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
             polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
+
+        See also
+        --------
+        numpy.polyfit
         """
         return self._to_temp_dataset().polyfit(
             dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 168f1755f47..0e883979ddf 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -5761,7 +5761,7 @@ def polyfit(
             to the coefficients.
         cov : Union[bool, str], optional
             Whether to return to the covariance matrix in addition to the coefficients.
-            The matrix is not scaled if `cov = 'unscaled'`.
+            The matrix is not scaled if `cov='unscaled'`.

         See documentation of `numpy.polyfit`.
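The `cov` option documented in the docstring hunks above adds the covariance of the estimated coefficients to the returned dataset, and `cov='unscaled'` skips the scaling by the residuals. A small sketch of how it might be used (synthetic noisy data; behaviour as described by the docstrings, not by a doc example in the patches):

    import numpy as np
    import xarray as xr

    rng = np.random.default_rng(0)
    x = np.arange(50)
    noisy = xr.DataArray(
        3 + 4 * x + rng.normal(scale=0.1, size=x.size),
        dims=["x"],
        coords={"x": x},
    )

    out = noisy.polyfit(dim="x", deg=1, cov=True)
    # "polyfit_covariance" holds the (deg + 1) x (deg + 1) covariance of the
    # fitted coefficients; with cov="unscaled" it is left unscaled.
    print(out.polyfit_covariance)
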
@@ -5773,6 +5773,10 @@ def polyfit(
             [dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
             [dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
             [var_]polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
+
+        See also
+        --------
+        numpy.polyfit
         """
         variables = {}
         skipna_da = skipna

From 3f7fe4c43972883c3e7e18eeb68d833c70a57697 Mon Sep 17 00:00:00 2001
From: Pascal Bourgault
Date: Fri, 13 Mar 2020 17:49:31 -0400
Subject: [PATCH 19/20] Clean up docstrings and documentation.

---
 doc/computation.rst      |  2 +-
 xarray/core/dataarray.py | 23 ++++++++++++++---------
 xarray/core/dataset.py   | 22 ++++++++++++++--------
 3 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/doc/computation.rst b/doc/computation.rst
index 291172edca1..520a2261e31 100644
--- a/doc/computation.rst
+++ b/doc/computation.rst
@@ -328,7 +328,7 @@ best fitting coefficients along a given dimension and for a given order,

 .. ipython:: python

-    x = np.arange(10)
+    x = xr.DataArray(np.arange(10), dims=['x'], name='x')
     a = xr.DataArray(3 + 4 * x, dims=['x'], coords={'x': x})
     out = a.polyfit(dim='x', deg=1, full=True)
     out
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index aeb16eed46f..b923f9b15e3 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -3268,7 +3268,7 @@ def polyfit(
         deg : int
             Degree of the fitting polynomial.
         skipna : bool, optional
-            If True, removes all invalid values before fitting each 1D slices of the array
+            If True, removes all invalid values before fitting each 1D slices of the array.
             Default is True if data is stored in a dask.array or if there is any
             invalid values, False otherwise.
         rcond : float, optional
@@ -3283,16 +3283,21 @@ def polyfit(
             Whether to return to the covariance matrix in addition to the coefficients.
             The matrix is not scaled if `cov='unscaled'`.

-        See documentation of `numpy.polyfit`.
-
         Returns
         -------
-        A single dataset which containts:
-            polyfit_coefficients : The coefficients of the best fit.
-            polyfit_residuals : The residuals of the least-square computation (only included if `full=True`)
-            [dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
-            [dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
-            polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
+        polyfit_results : Dataset
+            A single dataset which contains:
+
+            polyfit_coefficients
+                The coefficients of the best fit.
+            polyfit_residuals
+                The residuals of the least-square computation (only included if `full=True`)
+            [dim]_matrix_rank
+                The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            [dim]_singular_value
+                The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            polyfit_covariance
+                The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)

         See also
         --------
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 0e883979ddf..b40773c75f3 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -5748,7 +5748,7 @@ def polyfit(
         deg : int
             Degree of the fitting polynomial.
         skipna : bool, optional
-            If True, removes all invalid values before fitting each 1D slices of the array
+            If True, removes all invalid values before fitting each 1D slices of the array.
             Default is True if data is stored in a dask.array or if there is any
             invalid values, False otherwise.
         rcond : float, optional
@@ -5763,16 +5763,22 @@ def polyfit(
             Whether to return to the covariance matrix in addition to the coefficients.
             The matrix is not scaled if `cov='unscaled'`.

-        See documentation of `numpy.polyfit`.

         Returns
         -------
-        A single dataset which containts:
-            [var_]polyfit_coefficients : The coefficients of the best fit for each variable in this dataset.
-            [var_]polyfit_residuals : The residuals of the least-square computation for each variable (only included if `full=True`)
-            [dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
-            [dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
-            [var_]polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
+        polyfit_results : Dataset
+            A single dataset which contains (for each "var" in the input dataset):
+
+            [var]_polyfit_coefficients
+                The coefficients of the best fit for each variable in this dataset.
+            [var]_polyfit_residuals
+                The residuals of the least-square computation for each variable (only included if `full=True`)
+            [dim]_matrix_rank
+                The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            [dim]_singular_values
+                The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
+            [var]_polyfit_covariance
+                The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)

         See also
         --------
         numpy.polyfit
         """
         variables = {}
         skipna_da = skipna

From 7eeba59ff487d5bc51809da4ae824e7283b5b2aa Mon Sep 17 00:00:00 2001
From: Phobos
Date: Wed, 25 Mar 2020 10:37:46 -0400
Subject: [PATCH 20/20] Move whats new entry to 0.16 | fix PEP8 issue in test_dataarray

---
 doc/whats-new.rst              | 4 ++--
 xarray/tests/test_dataarray.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 3f1a3c6e993..2a9ee846b28 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -24,6 +24,8 @@ Breaking changes

 New Features
 ~~~~~~~~~~~~
+- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`)
+  By `Pascal Bourgault `_.
 - Control over attributes of result in :py:func:`merge`, :py:func:`concat`,
   :py:func:`combine_by_coords` and :py:func:`combine_nested` using
   combine_attrs keyword argument. (:issue:`3865`, :pull:`3877`)
@@ -212,8 +214,6 @@ Breaking changes

 New Features
 ~~~~~~~~~~~~
-- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`)
-  By `Pascal Bourgault `_.
 - Implement :py:meth:`DataArray.pad` and :py:meth:`Dataset.pad`. (:issue:`2605`, :pull:`3596`).
   By `Mark Boer `_.
 - :py:meth:`DataArray.sel` and :py:meth:`Dataset.sel` now support :py:class:`pandas.CategoricalIndex`.
   (:issue:`3669`)
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 6416ca877bc..e23ff2f7e31 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -4240,6 +4240,7 @@ def test_polyfit(self, use_dask, use_datetime):
         assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3)
         assert out.x_matrix_rank == 3
         np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0])
+
     def test_pad_constant(self):
         ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5))
         actual = ar.pad(dim_0=(1, 3))
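
For reference, a sketch of what the `full=True` assertions in the test above verify, on made-up data (the real fixtures of `test_polyfit` are not reproduced here):

    import numpy as np
    import xarray as xr

    x = np.arange(10)
    da = xr.DataArray(
        np.stack([1 + 2 * x + 3 * x ** 2, 4 + 5 * x + 6 * x ** 2]),
        dims=("d", "x"),
        coords={"x": x, "d": [0, 1]},
    )

    out = da.polyfit("x", 2, full=True)
    # Exact quadratic fits: the residuals are essentially zero and the scaled
    # Vandermonde matrix has full rank (deg + 1 = 3).
    np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0])
    assert out.x_matrix_rank == 3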