diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 437f53b1a91..cc9517a98ba 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -379,7 +379,6 @@ Variable.min Variable.no_conflicts Variable.notnull - Variable.pad_with_fill_value Variable.prod Variable.quantile Variable.rank @@ -453,7 +452,6 @@ IndexVariable.min IndexVariable.no_conflicts IndexVariable.notnull - IndexVariable.pad_with_fill_value IndexVariable.prod IndexVariable.quantile IndexVariable.rank diff --git a/doc/api.rst b/doc/api.rst index 4492d882355..b3b6aa9139d 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -220,6 +220,7 @@ Reshaping and reorganizing Dataset.to_stacked_array Dataset.shift Dataset.roll + Dataset.pad Dataset.sortby Dataset.broadcast_like @@ -399,6 +400,7 @@ Reshaping and reorganizing DataArray.to_unstacked_dataset DataArray.shift DataArray.roll + DataArray.pad DataArray.sortby DataArray.broadcast_like diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 24120270444..fa71be23db9 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -125,6 +125,8 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Implement :py:meth:`DataArray.pad` and :py:meth:`Dataset.pad`. (:issue:`2605`, :pull:`3596`). + By `Mark Boer `_. - :py:meth:`DataArray.sel` and :py:meth:`Dataset.sel` now support :py:class:`pandas.CategoricalIndex`. (:issue:`3669`) By `Keisuke Fujii `_. - Support using an existing, opened h5netcdf ``File`` with diff --git a/xarray/core/dask_array_compat.py b/xarray/core/dask_array_compat.py index 05f750a1355..94c50d90e84 100644 --- a/xarray/core/dask_array_compat.py +++ b/xarray/core/dask_array_compat.py @@ -1,3 +1,4 @@ +import warnings from distutils.version import LooseVersion from typing import Iterable @@ -99,6 +100,52 @@ def meta_from_array(x, ndim=None, dtype=None): return meta +def _validate_pad_output_shape(input_shape, pad_width, output_shape): + """ Validates the output shape of dask.array.pad, raising a RuntimeError if they do not match. + In the current versions of dask (2.2/2.4), dask.array.pad with mode='reflect' sometimes returns + an invalid shape. + """ + isint = lambda i: isinstance(i, int) + + if isint(pad_width): + pass + elif len(pad_width) == 2 and all(map(isint, pad_width)): + pad_width = sum(pad_width) + elif ( + len(pad_width) == len(input_shape) + and all(map(lambda x: len(x) == 2, pad_width)) + and all((isint(i) for p in pad_width for i in p)) + ): + pad_width = np.sum(pad_width, axis=1) + else: + # unreachable: dask.array.pad should already have thrown an error + raise ValueError("Invalid value for `pad_width`") + + if not np.array_equal(np.array(input_shape) + pad_width, output_shape): + raise RuntimeError( + "There seems to be something wrong with the shape of the output of dask.array.pad, " + "try upgrading Dask, use a different pad mode e.g. mode='constant' or first convert " + "your DataArray/Dataset to one backed by a numpy array by calling the `compute()` method." + "See: https://github.com/dask/dask/issues/5303" + ) + + +def pad(array, pad_width, mode="constant", **kwargs): + padded = da.pad(array, pad_width, mode=mode, **kwargs) + # workaround for inconsistency between numpy and dask: https://github.com/dask/dask/issues/5303 + if mode == "mean" and issubclass(array.dtype.type, np.integer): + warnings.warn( + 'dask.array.pad(mode="mean") converts integers to floats. xarray converts ' + "these floats back to integers to keep the interface consistent. There is a chance that " + "this introduces rounding errors. 
If you wish to keep the values as floats, first change " + "the dtype to a float before calling pad.", + UserWarning, + ) + return da.round(padded).astype(array.dtype) + _validate_pad_output_shape(array.shape, pad_width, padded.shape) + return padded + + if LooseVersion(dask_version) >= LooseVersion("2.8.1"): median = da.median else: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4e80ef222c2..4d9993d7383 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3239,6 +3239,174 @@ def map_blocks( return map_blocks(func, self, args, kwargs) + def pad( + self, + pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, + mode: str = "constant", + stat_length: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + constant_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + end_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + reflect_type: str = None, + **pad_width_kwargs: Any, + ) -> "DataArray": + """Pad this array along one or more dimensions. + + .. warning:: + This function is experimental and its behaviour is likely to change + especially regarding padding of dimension coordinates (or IndexVariables). + + When using one of the modes ("edge", "reflect", "symmetric", "wrap"), + coordinates will be padded with the same mode, otherwise coordinates + are padded using the "constant" mode with fill_value dtypes.NA. + + Parameters + ---------- + pad_width : Mapping with the form of {dim: (pad_before, pad_after)} + Number of values padded along each dimension. + {dim: pad} is a shortcut for pad_before = pad_after = pad + mode : str + One of the following string values (taken from numpy docs) + + 'constant' (default) + Pads with a constant value. + 'edge' + Pads with the edge values of array. + 'linear_ramp' + Pads with the linear ramp between end_value and the + array edge value. + 'maximum' + Pads with the maximum value of all or part of the + vector along each axis. + 'mean' + Pads with the mean value of all or part of the + vector along each axis. + 'median' + Pads with the median value of all or part of the + vector along each axis. + 'minimum' + Pads with the minimum value of all or part of the + vector along each axis. + 'reflect' + Pads with the reflection of the vector mirrored on + the first and last values of the vector along each + axis. + 'symmetric' + Pads with the reflection of the vector mirrored + along the edge of the array. + 'wrap' + Pads with the wrap of the vector along the axis. + The first values are used to pad the end and the + end values are used to pad the beginning. + stat_length : int, tuple or mapping of the form {dim: tuple} + Used in 'maximum', 'mean', 'median', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + {dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)} unique + statistic lengths along each dimension. + ((before, after),) yields same before and after statistic lengths + for each dimension. + (stat_length,) or int is a shortcut for before = after = statistic + length for all axes. + Default is ``None``, to use the entire axis. + constant_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'constant'. The values to set the padded values for each + axis. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + pad constants along each dimension. 
+ ``((before, after),)`` yields same before and after constants for each + dimension. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all dimensions. + Default is 0. + end_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + end values along each dimension. + ``((before, after),)`` yields same before and after end values for each + axis. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all axes. + Default is 0. + reflect_type : {'even', 'odd'}, optional + Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. For + the 'odd' style, the extended part of the array is created by + subtracting the reflected values from two times the edge value. + **pad_width_kwargs: + The keyword arguments form of ``pad_width``. + One of ``pad_width`` or ``pad_width_kwargs`` must be provided. + + Returns + ------- + padded : DataArray + DataArray with the padded coordinates and data. + + See also + -------- + DataArray.shift, DataArray.roll, DataArray.bfill, DataArray.ffill, numpy.pad, dask.array.pad + + Notes + ----- + By default when ``mode="constant"`` and ``constant_values=None``, integer types will be + promoted to ``float`` and padded with ``np.nan``. To avoid type promotion + specify ``constant_values=np.nan`` + + Examples + -------- + + >>> arr = xr.DataArray([5, 6, 7], coords=[("x", [0,1,2])]) + >>> arr.pad(x=(1,2), constant_values=0) + + array([0, 5, 6, 7, 0, 0]) + Coordinates: + * x (x) float64 nan 0.0 1.0 2.0 nan nan + + >>> da = xr.DataArray([[0,1,2,3], [10,11,12,13]], + dims=["x", "y"], + coords={"x": [0,1], "y": [10, 20 ,30, 40], "z": ("x", [100, 200])} + ) + >>> da.pad(x=1) + + array([[nan, nan, nan, nan], + [ 0., 1., 2., 3.], + [10., 11., 12., 13.], + [nan, nan, nan, nan]]) + Coordinates: + * x (x) float64 nan 0.0 1.0 nan + * y (y) int64 10 20 30 40 + z (x) float64 nan 100.0 200.0 nan + >>> da.pad(x=1, constant_values=np.nan) + + array([[-9223372036854775808, -9223372036854775808, -9223372036854775808, + -9223372036854775808], + [ 0, 1, 2, + 3], + [ 10, 11, 12, + 13], + [-9223372036854775808, -9223372036854775808, -9223372036854775808, + -9223372036854775808]]) + Coordinates: + * x (x) float64 nan 0.0 1.0 nan + * y (y) int64 10 20 30 40 + z (x) float64 nan 100.0 200.0 nan + """ + ds = self._to_temp_dataset().pad( + pad_width=pad_width, + mode=mode, + stat_length=stat_length, + constant_values=constant_values, + end_values=end_values, + reflect_type=reflect_type, + **pad_width_kwargs, + ) + return self._from_temp_dataset(ds) + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = property(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 52940e98b27..937bebc2bc8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5699,5 +5699,171 @@ def map_blocks( return map_blocks(func, self, args, kwargs) + def pad( + self, + pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, + mode: str = "constant", + stat_length: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + constant_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, 
+ end_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + reflect_type: str = None, + **pad_width_kwargs: Any, + ) -> "Dataset": + """Pad this dataset along one or more dimensions. + + .. warning:: + This function is experimental and its behaviour is likely to change + especially regarding padding of dimension coordinates (or IndexVariables). + + When using one of the modes ("edge", "reflect", "symmetric", "wrap"), + coordinates will be padded with the same mode, otherwise coordinates + are padded using the "constant" mode with fill_value dtypes.NA. + + Parameters + ---------- + pad_width : Mapping with the form of {dim: (pad_before, pad_after)} + Number of values padded along each dimension. + {dim: pad} is a shortcut for pad_before = pad_after = pad + mode : str + One of the following string values (taken from numpy docs). + + 'constant' (default) + Pads with a constant value. + 'edge' + Pads with the edge values of array. + 'linear_ramp' + Pads with the linear ramp between end_value and the + array edge value. + 'maximum' + Pads with the maximum value of all or part of the + vector along each axis. + 'mean' + Pads with the mean value of all or part of the + vector along each axis. + 'median' + Pads with the median value of all or part of the + vector along each axis. + 'minimum' + Pads with the minimum value of all or part of the + vector along each axis. + 'reflect' + Pads with the reflection of the vector mirrored on + the first and last values of the vector along each + axis. + 'symmetric' + Pads with the reflection of the vector mirrored + along the edge of the array. + 'wrap' + Pads with the wrap of the vector along the axis. + The first values are used to pad the end and the + end values are used to pad the beginning. + stat_length : int, tuple or mapping of the form {dim: tuple} + Used in 'maximum', 'mean', 'median', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + {dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)} unique + statistic lengths along each dimension. + ((before, after),) yields same before and after statistic lengths + for each dimension. + (stat_length,) or int is a shortcut for before = after = statistic + length for all axes. + Default is ``None``, to use the entire axis. + constant_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'constant'. The values to set the padded values for each + axis. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + pad constants along each dimension. + ``((before, after),)`` yields same before and after constants for each + dimension. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all dimensions. + Default is 0. + end_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique + end values along each dimension. + ``((before, after),)`` yields same before and after end values for each + axis. + ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for + all axes. + Default is 0. + reflect_type : {'even', 'odd'}, optional + Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. 
For + the 'odd' style, the extended part of the array is created by + subtracting the reflected values from two times the edge value. + **pad_width_kwargs: + The keyword arguments form of ``pad_width``. + One of ``pad_width`` or ``pad_width_kwargs`` must be provided. + + Returns + ------- + padded : Dataset + Dataset with the padded coordinates and data. + + See also + -------- + Dataset.shift, Dataset.roll, Dataset.bfill, Dataset.ffill, numpy.pad, dask.array.pad + + Notes + ----- + By default when ``mode="constant"`` and ``constant_values=None``, integer types will be + promoted to ``float`` and padded with ``np.nan``. To avoid type promotion + specify ``constant_values=np.nan`` + + Examples + -------- + + >>> ds = xr.Dataset({'foo': ('x', range(5))}) + >>> ds.pad(x=(1,2)) + + Dimensions: (x: 8) + Dimensions without coordinates: x + Data variables: + foo (x) float64 nan 0.0 1.0 2.0 3.0 4.0 nan nan + """ + pad_width = either_dict_or_kwargs(pad_width, pad_width_kwargs, "pad") + + if mode in ("edge", "reflect", "symmetric", "wrap"): + coord_pad_mode = mode + coord_pad_options = { + "stat_length": stat_length, + "constant_values": constant_values, + "end_values": end_values, + "reflect_type": reflect_type, + } + else: + coord_pad_mode = "constant" + coord_pad_options = {} + + variables = {} + for name, var in self.variables.items(): + var_pad_width = {k: v for k, v in pad_width.items() if k in var.dims} + if not var_pad_width: + variables[name] = var + elif name in self.data_vars: + variables[name] = var.pad( + pad_width=var_pad_width, + mode=mode, + stat_length=stat_length, + constant_values=constant_values, + end_values=end_values, + reflect_type=reflect_type, + ) + else: + variables[name] = var.pad( + pad_width=var_pad_width, + mode=coord_pad_mode, + **coord_pad_options, # type: ignore + ) + + return self._replace_vars_and_dims(variables) + ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 6d0abe9a6fc..ff2d0af63ed 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -114,7 +114,7 @@ def notnull(data): isin = _dask_or_eager_func("isin", array_args=slice(2)) take = _dask_or_eager_func("take") broadcast_to = _dask_or_eager_func("broadcast_to") -pad = _dask_or_eager_func("pad") +pad = _dask_or_eager_func("pad", dask_module=dask_array_compat) _concatenate = _dask_or_eager_func("concatenate", list_of_args=True) _stack = _dask_or_eager_func("stack", list_of_args=True) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 61178cfb15f..6e5cf187d5a 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -348,7 +348,7 @@ def _bottleneck_reduce(self, func, **kwargs): else: shift = (-self.window // 2) + 1 valid = (slice(None),) * axis + (slice(-shift, None),) - padded = padded.pad_with_fill_value({self.dim: (0, -shift)}) + padded = padded.pad({self.dim: (0, -shift)}, mode="constant") if isinstance(padded.data, dask_array_type): raise AssertionError("should not be reachable") diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 62f9fde6a2e..ce27b118180 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,11 +1,12 @@ import copy import functools import itertools +import numbers import warnings from collections import defaultdict from datetime import timedelta from distutils.version import LooseVersion -from typing import Any, Dict, Hashable, Mapping, TypeVar, Union +from typing import Any, Dict, Hashable, Mapping, Tuple, 
TypeVar, Union import numpy as np import pandas as pd @@ -32,12 +33,6 @@ infix_dims, ) -try: - import dask.array as da -except ImportError: - pass - - NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( indexing.ExplicitlyIndexed, pd.Index, @@ -1150,66 +1145,114 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs): result = result._shift_one_dim(dim, count, fill_value=fill_value) return result - def pad_with_fill_value( - self, pad_widths=None, fill_value=dtypes.NA, **pad_widths_kwargs + def _pad_options_dim_to_index( + self, + pad_option: Mapping[Hashable, Union[int, Tuple[int, int]]], + fill_with_shape=False, + ): + if fill_with_shape: + return [ + (n, n) if d not in pad_option else pad_option[d] + for d, n in zip(self.dims, self.data.shape) + ] + return [(0, 0) if d not in pad_option else pad_option[d] for d in self.dims] + + def pad( + self, + pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None, + mode: str = "constant", + stat_length: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + constant_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + end_values: Union[ + int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]] + ] = None, + reflect_type: str = None, + **pad_width_kwargs: Any, ): """ - Return a new Variable with paddings. + Return a new Variable with padded data. Parameters ---------- - pad_width: Mapping of the form {dim: (before, after)} - Number of values padded to the edges of each dimension. - **pad_widths_kwargs: - Keyword argument for pad_widths + pad_width: Mapping with the form of {dim: (pad_before, pad_after)} + Number of values padded along each dimension. + {dim: pad} is a shortcut for pad_before = pad_after = pad + mode: (str) + See numpy / Dask docs + stat_length : int, tuple or mapping of the form {dim: tuple} + Used in 'maximum', 'mean', 'median', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + constant_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'constant'. The values to set the padded values for each + axis. + end_values : scalar, tuple or mapping of the form {dim: tuple} + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + reflect_type : {'even', 'odd'}, optional + Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. For + the 'odd' style, the extended part of the array is created by + subtracting the reflected values from two times the edge value. + **pad_width_kwargs: + One of pad_width or pad_width_kwargs must be provided. + + Returns + ------- + padded : Variable + Variable with the same dimensions and attributes but padded data. """ - pad_widths = either_dict_or_kwargs(pad_widths, pad_widths_kwargs, "pad") + pad_width = either_dict_or_kwargs(pad_width, pad_width_kwargs, "pad") - if fill_value is dtypes.NA: - dtype, fill_value = dtypes.maybe_promote(self.dtype) + # change default behaviour of pad with mode constant + if mode == "constant" and ( + constant_values is None or constant_values is dtypes.NA + ): + dtype, constant_values = dtypes.maybe_promote(self.dtype) else: dtype = self.dtype - if isinstance(self.data, dask_array_type): - array = self.data - - # Dask does not yet support pad. We manually implement it. 
- # https://github.com/dask/dask/issues/1926 - for d, pad in pad_widths.items(): - axis = self.get_axis_num(d) - before_shape = list(array.shape) - before_shape[axis] = pad[0] - before_chunks = list(array.chunks) - before_chunks[axis] = (pad[0],) - after_shape = list(array.shape) - after_shape[axis] = pad[1] - after_chunks = list(array.chunks) - after_chunks[axis] = (pad[1],) - - arrays = [] - if pad[0] > 0: - arrays.append( - da.full( - before_shape, fill_value, dtype=dtype, chunks=before_chunks - ) - ) - arrays.append(array) - if pad[1] > 0: - arrays.append( - da.full( - after_shape, fill_value, dtype=dtype, chunks=after_chunks - ) - ) - if len(arrays) > 1: - array = da.concatenate(arrays, axis=axis) - else: - pads = [(0, 0) if d not in pad_widths else pad_widths[d] for d in self.dims] - array = np.pad( - self.data.astype(dtype, copy=False), - pads, - mode="constant", - constant_values=fill_value, + # create pad_options_kwargs, numpy requires only relevant kwargs to be nonempty + if isinstance(stat_length, dict): + stat_length = self._pad_options_dim_to_index( + stat_length, fill_with_shape=True ) + if isinstance(constant_values, dict): + constant_values = self._pad_options_dim_to_index(constant_values) + if isinstance(end_values, dict): + end_values = self._pad_options_dim_to_index(end_values) + + # workaround for bug in Dask's default value of stat_length https://github.com/dask/dask/issues/5303 + if stat_length is None and mode in ["maximum", "mean", "median", "minimum"]: + stat_length = [(n, n) for n in self.data.shape] # type: ignore + + # change integer values to a tuple of two of those values and change pad_width to index + for k, v in pad_width.items(): + if isinstance(v, numbers.Number): + pad_width[k] = (v, v) + pad_width_by_index = self._pad_options_dim_to_index(pad_width) + + # create pad_options_kwargs, numpy/dask requires only relevant kwargs to be nonempty + pad_option_kwargs = {} + if stat_length is not None: + pad_option_kwargs["stat_length"] = stat_length + if constant_values is not None: + pad_option_kwargs["constant_values"] = constant_values + if end_values is not None: + pad_option_kwargs["end_values"] = end_values + if reflect_type is not None: + pad_option_kwargs["reflect_type"] = reflect_type # type: ignore + + array = duck_array_ops.pad( + self.data.astype(dtype, copy=False), + pad_width_by_index, + mode=mode, + **pad_option_kwargs, + ) + return type(self)(self.dims, array) def _roll_one_dim(self, dim, count): @@ -1926,10 +1969,10 @@ def _coarsen_reshape(self, windows, boundary, side): if pad < 0: pad += window if side[d] == "left": - pad_widths = {d: (0, pad)} + pad_width = {d: (0, pad)} else: - pad_widths = {d: (pad, 0)} - variable = variable.pad_with_fill_value(pad_widths) + pad_width = {d: (pad, 0)} + variable = variable.pad(pad_width, mode="constant") else: raise TypeError( "{} is invalid for boundary. 
Valid option is 'exact', " diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index b8a9c5edaf9..b63ff91f28e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4177,6 +4177,113 @@ def test_rank(self): y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=("z",)) assert_equal(y.rank("z", pct=True), y) + def test_pad_constant(self): + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad(dim_0=(1, 3)) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5).astype(np.float32), + mode="constant", + pad_width=((1, 3), (0, 0), (0, 0)), + constant_values=np.nan, + ) + ) + assert actual.shape == (7, 4, 5) + assert_identical(actual, expected) + + def test_pad_coords(self): + ar = DataArray( + np.arange(3 * 4 * 5).reshape(3, 4, 5), + [("x", np.arange(3)), ("y", np.arange(4)), ("z", np.arange(5))], + ) + actual = ar.pad(x=(1, 3), constant_values=1) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5), + mode="constant", + pad_width=((1, 3), (0, 0), (0, 0)), + constant_values=1, + ), + [ + ( + "x", + np.pad( + np.arange(3).astype(np.float32), + mode="constant", + pad_width=(1, 3), + constant_values=np.nan, + ), + ), + ("y", np.arange(4)), + ("z", np.arange(5)), + ], + ) + assert_identical(actual, expected) + + @pytest.mark.parametrize("mode", ("minimum", "maximum", "mean", "median")) + @pytest.mark.parametrize( + "stat_length", (None, 3, (1, 3), {"dim_0": (2, 1), "dim_2": (4, 2)}) + ) + def test_pad_stat_length(self, mode, stat_length): + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad(dim_0=(1, 3), dim_2=(2, 2), mode=mode, stat_length=stat_length) + if isinstance(stat_length, dict): + stat_length = (stat_length["dim_0"], (4, 4), stat_length["dim_2"]) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5), + pad_width=((1, 3), (0, 0), (2, 2)), + mode=mode, + stat_length=stat_length, + ) + ) + assert actual.shape == (7, 4, 9) + assert_identical(actual, expected) + + @pytest.mark.parametrize( + "end_values", (None, 3, (3, 5), {"dim_0": (2, 1), "dim_2": (4, 2)}) + ) + def test_pad_linear_ramp(self, end_values): + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad( + dim_0=(1, 3), dim_2=(2, 2), mode="linear_ramp", end_values=end_values + ) + if end_values is None: + end_values = 0 + elif isinstance(end_values, dict): + end_values = (end_values["dim_0"], (4, 4), end_values["dim_2"]) + expected = DataArray( + np.pad( + np.arange(3 * 4 * 5).reshape(3, 4, 5), + pad_width=((1, 3), (0, 0), (2, 2)), + mode="linear_ramp", + end_values=end_values, + ) + ) + assert actual.shape == (7, 4, 9) + assert_identical(actual, expected) + + @pytest.mark.parametrize("mode", ("reflect", "symmetric")) + @pytest.mark.parametrize("reflect_type", (None, "even", "odd")) + def test_pad_reflect(self, mode, reflect_type): + + ar = DataArray(np.arange(3 * 4 * 5).reshape(3, 4, 5)) + actual = ar.pad( + dim_0=(1, 3), dim_2=(2, 2), mode=mode, reflect_type=reflect_type + ) + np_kwargs = { + "array": np.arange(3 * 4 * 5).reshape(3, 4, 5), + "pad_width": ((1, 3), (0, 0), (2, 2)), + "mode": mode, + } + # numpy does not support reflect_type=None + if reflect_type is not None: + np_kwargs["reflect_type"] = reflect_type + expected = DataArray(np.pad(**np_kwargs)) + + assert actual.shape == (7, 4, 9) + assert_identical(actual, expected) + @pytest.fixture(params=[1]) def da(request): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 
44ffafb23b1..ab684434aa6 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5457,6 +5457,19 @@ def test_ipython_key_completion(self): ds.data_vars[item] # should not raise assert sorted(actual) == sorted(expected) + def test_pad(self): + ds = create_test_data(seed=1) + padded = ds.pad(dim2=(1, 1), constant_values=42) + + assert padded["dim2"].shape == (11,) + assert padded["var1"].shape == (8, 11) + assert padded["var2"].shape == (8, 11) + assert padded["var3"].shape == (10, 8) + assert dict(padded.dims) == {"dim1": 8, "dim2": 11, "dim3": 10, "time": 20} + + np.testing.assert_equal(padded["var1"].isel(dim2=[0, -1]).data, 42) + np.testing.assert_equal(padded["dim2"][[0, -1]].data, np.nan) + # Py.test tests diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index 21a212c29b3..09ab1be9af9 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -175,7 +175,7 @@ def test_variable_property(prop): marks=xfail(reason="mixed sparse-dense operation"), ), param( - do("pad_with_fill_value", pad_widths={"x": (1, 1)}, fill_value=5), + do("pad", mode="constant", pad_widths={"x": (1, 1)}, fill_value=5), True, marks=xfail(reason="Missing implementation for np.pad"), ), diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 9f63ebb1d42..bc784c6adc2 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -9,7 +9,7 @@ from xarray.core import formatting from xarray.core.npcompat import IS_NEP18_ACTIVE -from .test_variable import VariableSubclassobjects +from .test_variable import _PAD_XR_NP_ARGS, VariableSubclassobjects pint = pytest.importorskip("pint") DimensionalityError = pint.errors.DimensionalityError @@ -2032,42 +2032,32 @@ def test_no_conflicts(self, unit, dtype): assert expected == actual - def test_pad(self, dtype): + @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS) + def test_pad_constant_values(self, dtype, xr_arg, np_arg): data = np.arange(4 * 3 * 2).reshape(4, 3, 2).astype(dtype) * unit_registry.m v = xr.Variable(["x", "y", "z"], data) - xr_args = [{"x": (2, 1)}, {"y": (0, 3)}, {"x": (3, 1), "z": (2, 0)}] - np_args = [ - ((2, 1), (0, 0), (0, 0)), - ((0, 0), (0, 3), (0, 0)), - ((3, 1), (0, 0), (2, 0)), - ] - for xr_arg, np_arg in zip(xr_args, np_args): - actual = v.pad_with_fill_value(**xr_arg) - expected = xr.Variable( - v.dims, - np.pad( - v.data.astype(float), - np_arg, - mode="constant", - constant_values=np.nan, - ), - ) - xr.testing.assert_identical(expected, actual) - assert_units_equal(expected, actual) - assert isinstance(actual._data, type(v._data)) + actual = v.pad(**xr_arg, mode="constant") + expected = xr.Variable( + v.dims, + np.pad( + v.data.astype(float), np_arg, mode="constant", constant_values=np.nan, + ), + ) + xr.testing.assert_identical(expected, actual) + assert_units_equal(expected, actual) + assert isinstance(actual._data, type(v._data)) # for the boolean array, we pad False data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2) v = xr.Variable(["x", "y", "z"], data) - for xr_arg, np_arg in zip(xr_args, np_args): - actual = v.pad_with_fill_value(fill_value=data.flat[0], **xr_arg) - expected = xr.Variable( - v.dims, - np.pad(v.data, np_arg, mode="constant", constant_values=v.data.flat[0]), - ) - xr.testing.assert_identical(actual, expected) - assert_units_equal(expected, actual) + actual = v.pad(**xr_arg, mode="constant", constant_values=data.flat[0]) + expected = xr.Variable( + v.dims, + np.pad(v.data, np_arg, mode="constant", 
constant_values=v.data.flat[0]), + ) + xr.testing.assert_identical(actual, expected) + assert_units_equal(expected, actual) @pytest.mark.parametrize( "unit,error", @@ -2089,16 +2079,16 @@ def test_pad(self, dtype): pytest.param(unit_registry.m, None, id="identical_unit"), ), ) - def test_pad_with_fill_value(self, unit, error, dtype): + def test_pad_unit_constant_value(self, unit, error, dtype): array = np.linspace(0, 5, 3 * 10).reshape(3, 10).astype(dtype) * unit_registry.m variable = xr.Variable(("x", "y"), array) fill_value = -100 * unit - func = method("pad_with_fill_value", x=(2, 3), y=(1, 4)) + func = method("pad", mode="constant", x=(2, 3), y=(1, 4)) if error is not None: with pytest.raises(error): - func(variable, fill_value=fill_value) + func(variable, constant_values=fill_value) return @@ -2106,11 +2096,11 @@ def test_pad_with_fill_value(self, unit, error, dtype): expected = attach_units( func( strip_units(variable), - fill_value=strip_units(convert_units(fill_value, units)), + constant_values=strip_units(convert_units(fill_value, units)), ), units, ) - actual = func(variable, fill_value=fill_value) + actual = func(variable, constant_values=fill_value) assert_units_equal(expected, actual) xr.testing.assert_identical(expected, actual) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index c86ecd0121f..428ad9280e2 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -38,6 +38,14 @@ source_ndarray, ) +_PAD_XR_NP_ARGS = [ + [{"x": (2, 1)}, ((2, 1), (0, 0), (0, 0))], + [{"x": 1}, ((1, 1), (0, 0), (0, 0))], + [{"y": (0, 3)}, ((0, 0), (0, 3), (0, 0))], + [{"x": (3, 1), "z": (2, 0)}, ((3, 1), (0, 0), (2, 0))], + [{"x": (3, 1), "z": 2}, ((3, 1), (0, 0), (2, 2))], +] + class VariableSubclassobjects: def test_properties(self): @@ -785,36 +793,65 @@ def test_getitem_error(self): with raises_regex(IndexError, "Dimensions of indexers mis"): v[:, ind] - def test_pad(self): + @pytest.mark.parametrize( + "mode", + [ + "mean", + pytest.param( + "median", + marks=pytest.mark.xfail(reason="median is not implemented by Dask"), + ), + pytest.param( + "reflect", marks=pytest.mark.xfail(reason="dask.array.pad bug") + ), + "edge", + pytest.param( + "linear_ramp", + marks=pytest.mark.xfail( + reason="pint bug: https://github.com/hgrecco/pint/issues/1026" + ), + ), + "maximum", + "minimum", + "symmetric", + "wrap", + ], + ) + @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS) + def test_pad(self, mode, xr_arg, np_arg): data = np.arange(4 * 3 * 2).reshape(4, 3, 2) v = self.cls(["x", "y", "z"], data) - xr_args = [{"x": (2, 1)}, {"y": (0, 3)}, {"x": (3, 1), "z": (2, 0)}] - np_args = [ - ((2, 1), (0, 0), (0, 0)), - ((0, 0), (0, 3), (0, 0)), - ((3, 1), (0, 0), (2, 0)), - ] - for xr_arg, np_arg in zip(xr_args, np_args): - actual = v.pad_with_fill_value(**xr_arg) - expected = np.pad( - np.array(v.data.astype(float)), - np_arg, - mode="constant", - constant_values=np.nan, - ) - assert_array_equal(actual, expected) - assert isinstance(actual._data, type(v._data)) + actual = v.pad(mode=mode, **xr_arg) + expected = np.pad(data, np_arg, mode=mode) + + assert_array_equal(actual, expected) + assert isinstance(actual._data, type(v._data)) + + @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS) + def test_pad_constant_values(self, xr_arg, np_arg): + data = np.arange(4 * 3 * 2).reshape(4, 3, 2) + v = self.cls(["x", "y", "z"], data) + + actual = v.pad(**xr_arg) + expected = np.pad( + np.array(v.data.astype(float)), + np_arg, + mode="constant", + 
constant_values=np.nan, + ) + assert_array_equal(actual, expected) + assert isinstance(actual._data, type(v._data)) # for the boolean array, we pad False data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2) v = self.cls(["x", "y", "z"], data) - for xr_arg, np_arg in zip(xr_args, np_args): - actual = v.pad_with_fill_value(fill_value=False, **xr_arg) - expected = np.pad( - np.array(v.data), np_arg, mode="constant", constant_values=False - ) - assert_array_equal(actual, expected) + + actual = v.pad(mode="constant", constant_values=False, **xr_arg) + expected = np.pad( + np.array(v.data), np_arg, mode="constant", constant_values=False + ) + assert_array_equal(actual, expected) def test_rolling_window(self): # Just a working test. See test_nputils for the algorithm validation @@ -2054,8 +2091,28 @@ def test_getitem_uint(self): super().test_getitem_fancy() @pytest.mark.xfail - def test_pad(self): - super().test_rolling_window() + @pytest.mark.parametrize( + "mode", + [ + "mean", + "median", + "reflect", + "edge", + "linear_ramp", + "maximum", + "minimum", + "symmetric", + "wrap", + ], + ) + @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS) + def test_pad(self, mode, xr_arg, np_arg): + super().test_pad(mode, xr_arg, np_arg) + + @pytest.mark.xfail + @pytest.mark.parametrize("xr_arg, np_arg", _PAD_XR_NP_ARGS) + def test_pad_constant_values(self, xr_arg, np_arg): + super().test_pad_constant_values(xr_arg, np_arg) @pytest.mark.xfail def test_rolling_window(self):
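
For quick reference while reviewing, a minimal usage sketch of the ``pad`` method introduced in this diff; the variable names below are illustrative only, and the behaviour noted in the comments follows the docstrings added above (default constant padding with ``np.nan`` and dtype promotion, explicit ``constant_values``, and coordinate padding that reuses modes such as ``"reflect"``):

>>> import xarray as xr
>>> arr = xr.DataArray([1, 2, 3], dims="x")
>>> padded = arr.pad(x=(1, 2))  # default mode="constant": integer data promoted to float, padded with np.nan
>>> padded_int = arr.pad(x=1, constant_values=0)  # explicit constant keeps the integer dtype
>>> reflected = arr.pad(x=1, mode="reflect")  # data (and any coordinates) padded by reflection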