diff --git a/doc/api.rst b/doc/api.rst index b9c3e3bdd33..216f47f988f 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -30,6 +30,7 @@ Top-level functions zeros_like ones_like dot + polyval map_blocks show_versions set_options @@ -172,6 +173,7 @@ Computation Dataset.quantile Dataset.differentiate Dataset.integrate + Dataset.polyfit **Aggregation**: :py:attr:`~Dataset.all` @@ -352,6 +354,7 @@ Computation DataArray.quantile DataArray.differentiate DataArray.integrate + DataArray.polyfit DataArray.str **Aggregation**: diff --git a/doc/computation.rst b/doc/computation.rst index 5309f27e9b6..4b8014c4782 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -401,6 +401,32 @@ trapezoidal rule using their coordinates, and integration along multidimensional coordinate are not supported. +.. _compute.polyfit: + +Fitting polynomials +=================== + +Xarray objects provide an interface for performing linear or polynomial regressions +using the least-squares method. :py:meth:`~xarray.DataArray.polyfit` computes the +best fitting coefficients along a given dimension and for a given order, + +.. ipython:: python + + x = xr.DataArray(np.arange(10), dims=['x'], name='x') + a = xr.DataArray(3 + 4 * x, dims=['x'], coords={'x': x}) + out = a.polyfit(dim='x', deg=1, full=True) + out + +The method outputs a dataset containing the coefficients (and more if `full=True`). +The inverse operation is done with :py:meth:`~xarray.polyval`, + +.. ipython:: python + + xr.polyval(coord=x, coeffs=out.polyfit_coefficients) + +.. note:: + These methods replicate the behaviour of :py:func:`numpy.polyfit` and :py:func:`numpy.polyval`. + .. _compute.broadcasting: Broadcasting by dimension name diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4515f552812..2a9ee846b28 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,8 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`) + By `Pascal Bourgault `_. - Control over attributes of result in :py:func:`merge`, :py:func:`concat`, :py:func:`combine_by_coords` and :py:func:`combine_nested` using combine_attrs keyword argument. (:issue:`3865`, :pull:`3877`) diff --git a/xarray/__init__.py b/xarray/__init__.py index 331d8ecb09a..0fead57e5fb 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -17,7 +17,7 @@ from .core.alignment import align, broadcast from .core.combine import auto_combine, combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, dot, where +from .core.computation import apply_ufunc, dot, polyval, where from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset @@ -65,6 +65,7 @@ "open_mfdataset", "open_rasterio", "open_zarr", + "polyval", "register_dataarray_accessor", "register_dataset_accessor", "save_mfdataset", diff --git a/xarray/core/computation.py b/xarray/core/computation.py index f2941a3d0ba..13bf6248331 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1306,3 +1306,35 @@ def where(cond, x, y): dataset_join="exact", dask="allowed", ) + + +def polyval(coord, coeffs, degree_dim="degree"): + """Evaluate a polynomial at specific values + + Parameters + ---------- + coord : DataArray + The 1D coordinate along which to evaluate the polynomial. + coeffs : DataArray + Coefficients of the polynomials. + degree_dim : str, default "degree" + Name of the polynomial degree dimension in `coeffs`. + + See also + -------- + xarray.DataArray.polyfit + numpy.polyval + """ + from .dataarray import DataArray + from .missing import get_clean_interp_index + + x = get_clean_interp_index(coord, coord.name) + + deg_coord = coeffs[degree_dim] + + lhs = DataArray( + np.vander(x, int(deg_coord.max()) + 1), + dims=(coord.name, degree_dim), + coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)[::-1]}, + ) + return (lhs * coeffs).sum(degree_dim) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 37f261cc3ad..87f646352eb 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -95,3 +95,30 @@ def func(x, window, axis=-1): # crop boundary. index = (slice(None),) * axis + (slice(drop_size, drop_size + orig_shape[axis]),) return out[index] + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + import dask.array as da + + lhs_da = da.from_array(lhs, chunks=(rhs.chunks[0], lhs.shape[1])) + if skipna: + added_dim = rhs.ndim == 1 + if added_dim: + rhs = rhs.reshape(rhs.shape[0], 1) + results = da.apply_along_axis( + nputils._nanpolyfit_1d, + 0, + rhs, + lhs_da, + dtype=float, + shape=(lhs.shape[1] + 1,), + rcond=rcond, + ) + coeffs = results[:-1, ...] + residuals = results[-1, ...] + if added_dim: + coeffs = coeffs.reshape(coeffs.shape[0]) + residuals = residuals.reshape(residuals.shape[0]) + else: + coeffs, residuals, _, _ = da.linalg.lstsq(lhs_da, rhs) + return coeffs, residuals diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 232fb86144e..070886cfc34 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3275,6 +3275,68 @@ def map_blocks( return map_blocks(func, self, args, kwargs) + def polyfit( + self, + dim: Hashable, + deg: int, + skipna: bool = None, + rcond: float = None, + w: Union[Hashable, Any] = None, + full: bool = False, + cov: bool = False, + ): + """ + Least squares polynomial fit. + + This replicates the behaviour of `numpy.polyfit` but differs by skipping + invalid values when `skipna = True`. + + Parameters + ---------- + dim : hashable + Coordinate along which to fit the polynomials. + deg : int + Degree of the fitting polynomial. + skipna : bool, optional + If True, removes all invalid values before fitting each 1D slices of the array. + Default is True if data is stored in a dask.array or if there is any + invalid values, False otherwise. + rcond : float, optional + Relative condition number to the fit. + w : Union[Hashable, Any], optional + Weights to apply to the y-coordinate of the sample points. + Can be an array-like object or the name of a coordinate in the dataset. + full : bool, optional + Whether to return the residuals, matrix rank and singular values in addition + to the coefficients. + cov : Union[bool, str], optional + Whether to return to the covariance matrix in addition to the coefficients. + The matrix is not scaled if `cov='unscaled'`. + + Returns + ------- + polyfit_results : Dataset + A single dataset which contains: + + polyfit_coefficients + The coefficients of the best fit. + polyfit_residuals + The residuals of the least-square computation (only included if `full=True`) + [dim]_matrix_rank + The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`) + [dim]_singular_value + The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`) + polyfit_covariance + The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`) + + See also + -------- + numpy.polyfit + """ + return self._to_temp_dataset().polyfit( + dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov + ) + def pad( self, pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6f96e4f469c..c49694b1fc0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -76,6 +76,7 @@ merge_coordinates_without_align, merge_data_and_coords, ) +from .missing import get_clean_interp_index from .options import OPTIONS, _get_keep_attrs from .pycompat import dask_array_type from .utils import ( @@ -5748,6 +5749,184 @@ def map_blocks( return map_blocks(func, self, args, kwargs) + def polyfit( + self, + dim: Hashable, + deg: int, + skipna: bool = None, + rcond: float = None, + w: Union[Hashable, Any] = None, + full: bool = False, + cov: Union[bool, str] = False, + ): + """ + Least squares polynomial fit. + + This replicates the behaviour of `numpy.polyfit` but differs by skipping + invalid values when `skipna = True`. + + Parameters + ---------- + dim : hashable + Coordinate along which to fit the polynomials. + deg : int + Degree of the fitting polynomial. + skipna : bool, optional + If True, removes all invalid values before fitting each 1D slices of the array. + Default is True if data is stored in a dask.array or if there is any + invalid values, False otherwise. + rcond : float, optional + Relative condition number to the fit. + w : Union[Hashable, Any], optional + Weights to apply to the y-coordinate of the sample points. + Can be an array-like object or the name of a coordinate in the dataset. + full : bool, optional + Whether to return the residuals, matrix rank and singular values in addition + to the coefficients. + cov : Union[bool, str], optional + Whether to return to the covariance matrix in addition to the coefficients. + The matrix is not scaled if `cov='unscaled'`. + + + Returns + ------- + polyfit_results : Dataset + A single dataset which contains (for each "var" in the input dataset): + + [var]_polyfit_coefficients + The coefficients of the best fit for each variable in this dataset. + [var]_polyfit_residuals + The residuals of the least-square computation for each variable (only included if `full=True`) + [dim]_matrix_rank + The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`) + [dim]_singular_values + The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`) + [var]_polyfit_covariance + The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`) + + See also + -------- + numpy.polyfit + """ + variables = {} + skipna_da = skipna + + x = get_clean_interp_index(self, dim) + xname = "{}_".format(self[dim].name) + order = int(deg) + 1 + lhs = np.vander(x, order) + + if rcond is None: + rcond = x.shape[0] * np.core.finfo(x.dtype).eps + + # Weights: + if w is not None: + if isinstance(w, Hashable): + w = self.coords[w] + w = np.asarray(w) + if w.ndim != 1: + raise TypeError("Expected a 1-d array for weights.") + if w.shape[0] != lhs.shape[0]: + raise TypeError("Expected w and {} to have the same length".format(dim)) + lhs *= w[:, np.newaxis] + + # Scaling + scale = np.sqrt((lhs * lhs).sum(axis=0)) + lhs /= scale + + degree_dim = utils.get_temp_dimname(self.dims, "degree") + + rank = np.linalg.matrix_rank(lhs) + if rank != order and not full: + warnings.warn( + "Polyfit may be poorly conditioned", np.RankWarning, stacklevel=4 + ) + + if full: + rank = xr.DataArray(rank, name=xname + "matrix_rank") + variables[rank.name] = rank + sing = np.linalg.svd(lhs, compute_uv=False) + sing = xr.DataArray( + sing, + dims=(degree_dim,), + coords={degree_dim: np.arange(order)[::-1]}, + name=xname + "singular_values", + ) + variables[sing.name] = sing + + for name, da in self.data_vars.items(): + if dim not in da.dims: + continue + + if skipna is None: + if isinstance(da.data, dask_array_type): + skipna_da = True + else: + skipna_da = np.any(da.isnull()) + + dims_to_stack = [dimname for dimname in da.dims if dimname != dim] + stacked_coords = {} + if dims_to_stack: + stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") + rhs = da.transpose(dim, *dims_to_stack).stack( + {stacked_dim: dims_to_stack} + ) + stacked_coords = {stacked_dim: rhs[stacked_dim]} + scale_da = scale[:, np.newaxis] + else: + rhs = da + scale_da = scale + + if w is not None: + rhs *= w[:, np.newaxis] + + coeffs, residuals = duck_array_ops.least_squares( + lhs, rhs.data, rcond=rcond, skipna=skipna_da + ) + + if isinstance(name, str): + name = "{}_".format(name) + else: + # Thus a ReprObject => polyfit was called on a DataArray + name = "" + + coeffs = xr.DataArray( + coeffs / scale_da, + dims=[degree_dim] + list(stacked_coords.keys()), + coords={degree_dim: np.arange(order)[::-1], **stacked_coords}, + name=name + "polyfit_coefficients", + ) + if dims_to_stack: + coeffs = coeffs.unstack(stacked_dim) + variables[coeffs.name] = coeffs + + if full or (cov is True): + residuals = xr.DataArray( + residuals if dims_to_stack else residuals.squeeze(), + dims=list(stacked_coords.keys()), + coords=stacked_coords, + name=name + "polyfit_residuals", + ) + if dims_to_stack: + residuals = residuals.unstack(stacked_dim) + variables[residuals.name] = residuals + + if cov: + Vbase = np.linalg.inv(np.dot(lhs.T, lhs)) + Vbase /= np.outer(scale, scale) + if cov == "unscaled": + fac = 1 + else: + if x.shape[0] <= order: + raise ValueError( + "The number of data points must exceed order to scale the covariance matrix." + ) + fac = residuals / (x.shape[0] - order) + covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j"),) * fac + variables[name + "polyfit_covariance"] = covariance + + return Dataset(data_vars=variables, attrs=self.attrs.copy()) + def pad( self, pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index ff2d0af63ed..4047a1e68e1 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -597,3 +597,12 @@ def rolling_window(array, axis, window, center, fill_value): return dask_array_ops.rolling_window(array, axis, window, center, fill_value) else: # np.ndarray return nputils.rolling_window(array, axis, window, center, fill_value) + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + """Return the coefficients and residuals of a least-squares fit. + """ + if isinstance(rhs, dask_array_type): + return dask_array_ops.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) + else: + return nputils.least_squares(lhs, rhs, rcond=rcond, skipna=skipna) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 5dd8219ebca..fa6df63e0ea 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -220,6 +220,39 @@ def f(values, axis=None, **kwargs): return f +def _nanpolyfit_1d(arr, x, rcond=None): + out = np.full((x.shape[1] + 1,), np.nan) + mask = np.isnan(arr) + if not np.all(mask): + out[:-1], out[-1], _, _ = np.linalg.lstsq(x[~mask, :], arr[~mask], rcond=rcond) + return out + + +def least_squares(lhs, rhs, rcond=None, skipna=False): + if skipna: + added_dim = rhs.ndim == 1 + if added_dim: + rhs = rhs.reshape(rhs.shape[0], 1) + nan_cols = np.any(np.isnan(rhs), axis=0) + out = np.empty((lhs.shape[1] + 1, rhs.shape[1])) + if np.any(nan_cols): + out[:, nan_cols] = np.apply_along_axis( + _nanpolyfit_1d, 0, rhs[:, nan_cols], lhs + ) + if np.any(~nan_cols): + out[:-1, ~nan_cols], out[-1, ~nan_cols], _, _ = np.linalg.lstsq( + lhs, rhs[:, ~nan_cols], rcond=rcond + ) + coeffs = out[:-1, :] + residuals = out[-1, :] + if added_dim: + coeffs = coeffs.reshape(coeffs.shape[0]) + residuals = residuals.reshape(residuals.shape[0]) + else: + coeffs, residuals, _, _ = np.linalg.lstsq(lhs, rhs, rcond=rcond) + return coeffs, residuals + + nanmin = _create_bottleneck_method("nanmin") nanmax = _create_bottleneck_method("nanmax") nanmean = _create_bottleneck_method("nanmean") diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 369903552ad..4eed464d2dc 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1120,3 +1120,35 @@ def test_where(): actual = xr.where(cond, 1, 0) expected = xr.DataArray([1, 0], dims="x") assert_identical(expected, actual) + + +@pytest.mark.parametrize("use_dask", [True, False]) +@pytest.mark.parametrize("use_datetime", [True, False]) +def test_polyval(use_dask, use_datetime): + if use_dask and not has_dask: + pytest.skip("requires dask") + + if use_datetime: + xcoord = xr.DataArray( + pd.date_range("2000-01-01", freq="D", periods=10), dims=("x",), name="x" + ) + x = xr.core.missing.get_clean_interp_index(xcoord, "x") + else: + xcoord = x = np.arange(10) + + da = xr.DataArray( + np.stack((1.0 + x + 2.0 * x ** 2, 1.0 + 2.0 * x + 3.0 * x ** 2)), + dims=("d", "x"), + coords={"x": xcoord, "d": [0, 1]}, + ) + coeffs = xr.DataArray( + [[2, 1, 1], [3, 2, 1]], + dims=("d", "degree"), + coords={"d": [0, 1], "degree": [2, 1, 0]}, + ) + if use_dask: + coeffs = coeffs.chunk({"d": 2}) + + da_pv = xr.polyval(da.x, coeffs) + + xr.testing.assert_allclose(da, da_pv.T) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 4f19dc2a9cf..e23ff2f7e31 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -23,6 +23,7 @@ assert_array_equal, assert_equal, assert_identical, + has_dask, raises_regex, requires_bottleneck, requires_dask, @@ -4191,6 +4192,55 @@ def test_rank(self): y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=("z",)) assert_equal(y.rank("z", pct=True), y) + @pytest.mark.parametrize("use_dask", [True, False]) + @pytest.mark.parametrize("use_datetime", [True, False]) + def test_polyfit(self, use_dask, use_datetime): + if use_dask and not has_dask: + pytest.skip("requires dask") + xcoord = xr.DataArray( + pd.date_range("1970-01-01", freq="D", periods=10), dims=("x",), name="x" + ) + x = xr.core.missing.get_clean_interp_index(xcoord, "x") + if not use_datetime: + xcoord = x + + da_raw = DataArray( + np.stack( + (10 + 1e-15 * x + 2e-28 * x ** 2, 30 + 2e-14 * x + 1e-29 * x ** 2) + ), + dims=("d", "x"), + coords={"x": xcoord, "d": [0, 1]}, + ) + + if use_dask: + da = da_raw.chunk({"d": 1}) + else: + da = da_raw + + out = da.polyfit("x", 2) + expected = DataArray( + [[2e-28, 1e-15, 10], [1e-29, 2e-14, 30]], + dims=("d", "degree"), + coords={"degree": [2, 1, 0], "d": [0, 1]}, + ).T + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + + # With NaN + da_raw[0, 1] = np.nan + if use_dask: + da = da_raw.chunk({"d": 1}) + else: + da = da_raw + out = da.polyfit("x", 2, skipna=True, cov=True) + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + assert "polyfit_covariance" in out + + # Skipna + Full output + out = da.polyfit("x", 2, skipna=True, full=True) + assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) + assert out.x_matrix_rank == 3 + np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0]) + def test_pad_constant(self): ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) actual = ar.pad(dim_0=(1, 3)) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 20b814a25c7..02698253e5d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5499,6 +5499,19 @@ def test_ipython_key_completion(self): ds.data_vars[item] # should not raise assert sorted(actual) == sorted(expected) + def test_polyfit_output(self): + ds = create_test_data(seed=1) + + out = ds.polyfit("dim2", 2, full=False) + assert "var1_polyfit_coefficients" in out + + out = ds.polyfit("dim1", 2, full=True) + assert "var1_polyfit_coefficients" in out + assert "dim1_matrix_rank" in out + + out = ds.polyfit("time", 2) + assert len(out.data_vars) == 0 + def test_pad(self): ds = create_test_data(seed=1) padded = ds.pad(dim2=(1, 1), constant_values=42) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 157cd16cba6..e61881cfce3 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -16,6 +16,7 @@ first, gradient, last, + least_squares, mean, np_timedelta64_to_float, pd_timedelta_to_float, @@ -761,3 +762,20 @@ def test_timedelta_to_numeric(td): out = timedelta_to_numeric(td, "ns") np.testing.assert_allclose(out, 86400 * 1e9) assert isinstance(out, float) + + +@pytest.mark.parametrize("use_dask", [True, False]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_least_squares(use_dask, skipna): + if use_dask and not has_dask: + pytest.skip("requires dask") + lhs = np.array([[1, 2], [1, 2], [3, 2]]) + rhs = DataArray(np.array([3, 5, 7]), dims=("y",)) + + if use_dask: + rhs = rhs.chunk({"y": 1}) + + coeffs, residuals = least_squares(lhs, rhs.data, skipna=skipna) + + np.testing.assert_allclose(coeffs, [1.5, 1.25]) + np.testing.assert_allclose(residuals, [2.0])