From ccc94a93dad5ed7f90514e996d06eb9e9a3704ae Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Tue, 1 Feb 2022 11:00:38 -0500 Subject: [PATCH 1/7] ENH: Rolling window with step size (GH-15354) --- pandas/_libs/window/indexers.pyi | 3 +- pandas/_libs/window/indexers.pyx | 12 +- pandas/core/generic.py | 3 + pandas/core/indexers/objects.py | 146 +++++++++++++++++----- pandas/core/window/common.py | 11 +- pandas/core/window/ewm.py | 8 +- pandas/core/window/numba_.py | 4 +- pandas/core/window/rolling.py | 203 +++++++++++++++++++++++-------- 8 files changed, 294 insertions(+), 96 deletions(-) diff --git a/pandas/_libs/window/indexers.pyi b/pandas/_libs/window/indexers.pyi index c9bc64be34ac9..6d4f85f350c77 100644 --- a/pandas/_libs/window/indexers.pyi +++ b/pandas/_libs/window/indexers.pyi @@ -8,5 +8,6 @@ def calculate_variable_window_bounds( min_periods, center: bool, closed: str | None, + step: int | None, index: np.ndarray, # const int64_t[:] -) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... 
diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 4b3a858ade773..64b9cfc6a6979 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -16,6 +16,7 @@ def calculate_variable_window_bounds( object min_periods, # unused but here to match get_window_bounds signature bint center, str closed, + int64_t step, const int64_t[:] index ): """ @@ -38,17 +39,20 @@ def calculate_variable_window_bounds( closed : str string of side of the window that should be closed + step : int64 + Spacing between windows + index : ndarray[int64] time series index to roll over Returns ------- - (ndarray[int64], ndarray[int64]) + (ndarray[int64], ndarray[int64], ndarray[int64]) """ cdef: bint left_closed = False bint right_closed = False - ndarray[int64_t, ndim=1] start, end + ndarray[int64_t, ndim=1] start, end, ref int64_t start_bound, end_bound, index_growth_sign = 1 Py_ssize_t i, j @@ -143,4 +147,6 @@ def calculate_variable_window_bounds( # right endpoint is open if not right_closed and not center: end[i] -= 1 - return start, end + ref = (None if step is None or step == 1 + else np.arange(0, num_values, step, dtype='int64')) + return start[::step], end[::step], ref diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a497475ebd182..b481ad1a90fb7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11263,6 +11263,7 @@ def rolling( on: str | None = None, axis: Axis = 0, closed: str | None = None, + step: int | None = None, method: str = "single", ): axis = self._get_axis_number(axis) @@ -11277,6 +11278,7 @@ def rolling( on=on, axis=axis, closed=closed, + step=step, method=method, ) @@ -11289,6 +11291,7 @@ def rolling( on=on, axis=axis, closed=closed, + step=step, method=method, ) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 4d5e4bbe6bd36..3c0954f607a85 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -27,13 +27,17 @@ 
center passed from the top level rolling API closed : str, default None closed passed from the top level rolling API +step : int, default None + step passed from the top level rolling API win_type : str, default None win_type passed from the top level rolling API Returns ------- -A tuple of ndarray[int64]s, indicating the boundaries of each -window +A tuple of ndarray[int64]s: +start : array of start boundaries +end : array of end boundaries +ref : array of window reference locations, or None indicating all if step is None or 1 """ @@ -55,6 +59,16 @@ def __init__( for key, value in kwargs.items(): setattr(self, key, value) + def _get_default_ref(self, num_values: int = 0, step: int | None = None): + """ + Returns the default window reference locations. + """ + return ( + None + if step is None or step == 1 + else np.arange(0, num_values, step, dtype="int64") + ) + @Appender(get_window_bounds_doc) def get_window_bounds( self, @@ -66,9 +80,23 @@ def get_window_bounds( raise NotImplementedError + @Appender(get_window_bounds_doc) + def get_window_bounds2( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + + start, end = self.get_window_bounds(num_values, min_periods, center, closed) + ref = self._get_default_ref(num_values, step) + return start[::step], end[::step], ref -class FixedWindowIndexer(BaseIndexer): - """Creates window boundaries that are of fixed length.""" + +class BaseIndexer2(BaseIndexer): + """Base class for window bounds calculations with step optimization.""" @Appender(get_window_bounds_doc) def get_window_bounds( @@ -79,12 +107,43 @@ def get_window_bounds( closed: str | None = None, ) -> tuple[np.ndarray, np.ndarray]: + start, end, ref = self.get_window_bounds2( + num_values, min_periods, center, closed + ) + return start, end + + @Appender(get_window_bounds_doc) + def get_window_bounds2( + 
self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + + raise NotImplementedError + + +class FixedWindowIndexer(BaseIndexer2): + """Creates window boundaries that are of fixed length.""" + + @Appender(get_window_bounds_doc) + def get_window_bounds2( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + if center: offset = (self.window_size - 1) // 2 else: offset = 0 - end = np.arange(1 + offset, num_values + 1 + offset, dtype="int64") + end = np.arange(1 + offset, num_values + 1 + offset, step, dtype="int64") start = end - self.window_size if closed in ["left", "both"]: start -= 1 @@ -94,20 +153,22 @@ def get_window_bounds( end = np.clip(end, 0, num_values) start = np.clip(start, 0, num_values) - return start, end + ref = self._get_default_ref(num_values, step) + return start, end, ref -class VariableWindowIndexer(BaseIndexer): +class VariableWindowIndexer(BaseIndexer2): """Creates window boundaries that are of variable length, namely for time series.""" @Appender(get_window_bounds_doc) - def get_window_bounds( + def get_window_bounds2( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: # error: Argument 4 to "calculate_variable_window_bounds" has incompatible # type "Optional[bool]"; expected "bool" @@ -119,6 +180,7 @@ def get_window_bounds( min_periods, center, # type: ignore[arg-type] closed, + step if step is not None else 1, self.index_array, # type: ignore[arg-type] ) @@ -205,25 +267,28 @@ def get_window_bounds( return start, end -class ExpandingIndexer(BaseIndexer): +class 
ExpandingIndexer(BaseIndexer2): """Calculate expanding window bounds, mimicking df.expanding()""" @Appender(get_window_bounds_doc) - def get_window_bounds( + def get_window_bounds2( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: - return ( - np.zeros(num_values, dtype=np.int64), - np.arange(1, num_values + 1, dtype=np.int64), - ) + if step is None: + step = 1 + end = np.arange(1, num_values + 1, step, dtype=np.int64) + start = np.zeros(len(end), dtype=np.int64) + ref = self._get_default_ref(num_values, step) + return start, end, ref -class FixedForwardWindowIndexer(BaseIndexer): +class FixedForwardWindowIndexer(BaseIndexer2): """ Creates window boundaries for fixed-length windows that include the current row. @@ -250,13 +315,14 @@ class FixedForwardWindowIndexer(BaseIndexer): """ @Appender(get_window_bounds_doc) - def get_window_bounds( + def get_window_bounds2( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: if center: raise ValueError("Forward-looking windows can't have center=True") @@ -264,16 +330,19 @@ def get_window_bounds( raise ValueError( "Forward-looking windows don't support setting the closed argument" ) + if step is None: + step = 1 - start = np.arange(num_values, dtype="int64") + start = np.arange(0, num_values, step, dtype="int64") end = start + self.window_size if self.window_size: - end[-self.window_size :] = num_values + end = np.clip(end, 0, num_values) - return start, end + ref = self._get_default_ref(num_values, step) + return start, end, ref -class GroupbyIndexer(BaseIndexer): +class GroupbyIndexer(BaseIndexer2): """Calculate bounds to compute groupby rolling, mimicking 
df.groupby().rolling()""" def __init__( @@ -313,18 +382,21 @@ def __init__( ) @Appender(get_window_bounds_doc) - def get_window_bounds( + def get_window_bounds2( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: # 1) For each group, get the indices that belong to the group # 2) Use the indices to calculate the start & end bounds of the window # 3) Append the window bounds in group order start_arrays = [] end_arrays = [] + ref_arrays = [] + empty = np.array([], dtype=np.int64) window_indices_start = 0 for key, indices in self.groupby_indices.items(): index_array: np.ndarray | None @@ -338,11 +410,12 @@ def get_window_bounds( window_size=self.window_size, **self.indexer_kwargs, ) - start, end = indexer.get_window_bounds( - len(indices), min_periods, center, closed + start, end, ref = indexer.get_window_bounds2( + len(indices), min_periods, center, closed, step ) start = start.astype(np.int64) end = end.astype(np.int64) + ref = None if ref is None else ref.astype(np.int64) assert len(start) == len( end ), "these should be equal in length from get_window_bounds" @@ -358,21 +431,30 @@ def get_window_bounds( ) start_arrays.append(window_indices.take(ensure_platform_int(start))) end_arrays.append(window_indices.take(ensure_platform_int(end))) + ref_arrays.append( + empty if ref is None else window_indices.take(ensure_platform_int(ref)) + ) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) - return start, end + ref = None if step is None or step == 1 else np.concatenate(ref_arrays) + return start, end, ref -class ExponentialMovingWindowIndexer(BaseIndexer): +class ExponentialMovingWindowIndexer(BaseIndexer2): """Calculate ewm window bounds (the entire window)""" @Appender(get_window_bounds_doc) - def get_window_bounds( + def get_window_bounds2( self, num_values: int = 0, 
min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: - return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64) + return ( + np.array([0], dtype=np.int64), + np.array([num_values], dtype=np.int64), + None, + ) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 15144116fa924..7035a67acaea2 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -22,7 +22,9 @@ def flex_binary_moment(arg1, arg2, f, pairwise=False): from pandas import DataFrame def dataframe_from_int_dict(data, frame_template): - result = DataFrame(data, index=frame_template.index) + result = DataFrame( + data, index=None if len(data) > 0 else frame_template.index + ) if len(result.columns) > 0: result.columns = frame_template.columns[result.columns] return result @@ -42,13 +44,16 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'arg2' columns are not unique") X, Y = arg1.align(arg2, join="outer") X, Y = prep_binary(X, Y) + result_index = X.index res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) - return DataFrame(results, index=X.index, columns=res_columns) + result_index = results[col].index + return DataFrame(results, index=result_index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) + result_index = arg1.index.union(arg2.index) for i in range(len(arg1.columns)): for j in range(len(arg2.columns)): if j < i and arg2 is arg1: @@ -58,10 +63,10 @@ def dataframe_from_int_dict(data, frame_template): results[i][j] = f( *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) ) + result_index = results[i][j].index from pandas import concat - result_index = arg1.index.union(arg2.index) if len(result_index): # construct result frame diff --git 
a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 4bebc56273805..085c159e5852b 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -418,7 +418,7 @@ def __init__( ) def _check_window_bounds( - self, start: np.ndarray, end: np.ndarray, num_vals: int + self, start: np.ndarray, end: np.ndarray, ref: np.ndarray | None, num_vals: int ) -> None: # emw algorithms are iterative with each point # ExponentialMovingWindowIndexer "bounds" are the entire window @@ -732,11 +732,12 @@ def cov_func(x, y): if self.min_periods is not None else window_indexer.window_size ) - start, end = window_indexer.get_window_bounds( + start, end, ref = window_indexer.get_window_bounds2( num_values=len(x_array), min_periods=min_periods, center=self.center, closed=self.closed, + step=self.step, ) result = window_aggregations.ewmcov( x_array, @@ -798,11 +799,12 @@ def cov_func(x, y): if self.min_periods is not None else window_indexer.window_size ) - start, end = window_indexer.get_window_bounds( + start, end, ref = window_indexer.get_window_bounds2( num_values=len(x_array), min_periods=min_periods, center=self.center, closed=self.closed, + step=self.step, ) def _cov(X, Y): diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 0e8eea3ec671e..4a0f0b33fb063 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -234,8 +234,8 @@ def roll_table( minimum_periods: int, *args: Any, ): - result = np.empty(values.shape) - min_periods_mask = np.empty(values.shape) + result = np.empty((len(begin), values.shape[1])) + min_periods_mask = np.empty(result.shape) for i in numba.prange(len(result)): start = begin[i] stop = end[i] diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index bbd0181e47401..fc8812de0668b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,6 +14,7 @@ Any, Callable, Hashable, + cast, ) import warnings @@ -126,6 +127,7 @@ def __init__( axis: Axis = 0, 
on: str | Index | None = None, closed: str | None = None, + step: int | None = None, method: str = "single", *, selection=None, @@ -133,6 +135,7 @@ def __init__( self.obj = obj self.on = on self.closed = closed + self.step = step self.window = window self.min_periods = min_periods self.center = center @@ -213,34 +216,54 @@ def _validate(self) -> None: if isinstance(self.window, BaseIndexer): # Validate that the passed BaseIndexer subclass has # a get_window_bounds with the correct signature. - get_window_bounds_signature = inspect.signature( - self.window.get_window_bounds - ).parameters.keys() - expected_signature = inspect.signature( - BaseIndexer().get_window_bounds - ).parameters.keys() - if get_window_bounds_signature != expected_signature: - raise ValueError( - f"{type(self.window).__name__} does not implement " - f"the correct signature for get_window_bounds" - ) + for meth in ["get_window_bounds", "get_window_bounds2"]: + existing_signature = inspect.signature( + getattr(self.window, meth) + ).parameters.keys() + expected_signature = inspect.signature( + getattr(BaseIndexer(), meth) + ).parameters.keys() + if existing_signature != expected_signature: + raise ValueError( + f"{type(self.window).__name__} does not implement " + f"the correct signature for {meth}" + ) if self.method not in ["table", "single"]: raise ValueError("method must be 'table' or 'single") def _check_window_bounds( - self, start: np.ndarray, end: np.ndarray, num_vals: int + self, start: np.ndarray, end: np.ndarray, ref: np.ndarray | None, num_vals: int ) -> None: if len(start) != len(end): raise ValueError( f"start ({len(start)}) and end ({len(end)}) bounds must be the " f"same length" ) - elif len(start) != num_vals: + if ref is not None and len(start) != len(ref): + raise ValueError( + f"start ({len(start)}) and ref ({len(ref)}) arrays must be the " + f"same length" + ) + elif not isinstance(self._get_window_indexer(), GroupbyIndexer) and len( + start + ) != (num_vals + (self.step or 
1) - 1) // (self.step or 1): raise ValueError( f"start and end bounds ({len(start)}) must be the same length " - f"as the object ({num_vals})" + f"as the object ({num_vals}) divided by the step ({self.step}) " + f"if given and rounded up unless groupby was used" ) + def _slice_index(self, index: Index, at: np.ndarray | None) -> Index: + """ + Slices the index of the object. + """ + if at is None: + return index + result = index[at] + if isinstance(index, DatetimeIndex): + result.freq = None if index.freq is None else index.freq * (self.step or 1) + return result + def _create_data(self, obj: NDFrameT) -> NDFrameT: """ Split data into blocks & return conformed data. @@ -319,13 +342,14 @@ def __iter__(self): obj = self._create_data(obj) indexer = self._get_window_indexer() - start, end = indexer.get_window_bounds( + start, end, ref = indexer.get_window_bounds2( num_values=len(obj), min_periods=self.min_periods, center=self.center, closed=self.closed, + step=self.step, ) - self._check_window_bounds(start, end, len(obj)) + self._check_window_bounds(start, end, ref, len(obj)) for s, e in zip(start, end): result = obj.iloc[slice(s, e)] @@ -413,7 +437,11 @@ def _get_window_indexer(self) -> BaseIndexer: return FixedWindowIndexer(window_size=self.window) def _apply_series( - self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None + self, + homogeneous_func: Callable[ + ..., ArrayLike | tuple[ArrayLike, np.ndarray | None] + ], + name: str | None = None, ) -> Series: """ Series version of _apply_blockwise @@ -428,11 +456,18 @@ def _apply_series( except (TypeError, NotImplementedError) as err: raise DataError("No numeric types to aggregate") from err - result = homogeneous_func(values) - return obj._constructor(result, index=obj.index, name=obj.name) + result, ref = homogeneous_func(values), None + if type(result) is tuple: + result, ref = result + index = self._slice_index(obj.index, ref) + return obj._constructor(result, index=index, name=obj.name) def 
_apply_blockwise( - self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None + self, + homogeneous_func: Callable[ + ..., ArrayLike | tuple[ArrayLike, np.ndarray | None] + ], + name: str | None = None, ) -> DataFrame | Series: """ Apply the given function to the DataFrame broken down into homogeneous @@ -447,7 +482,7 @@ def _apply_blockwise( obj = notna(obj).astype(int) obj._mgr = obj._mgr.consolidate() - def hfunc(values: ArrayLike) -> ArrayLike: + def hfunc(values: ArrayLike) -> ArrayLike | tuple[ArrayLike, np.ndarray | None]: values = self._prep_values(values) return homogeneous_func(values) @@ -455,20 +490,32 @@ def hfunc(values: ArrayLike) -> ArrayLike: obj = obj.T taker = [] - res_values = [] + res_values: list[ArrayLike] = [] + ref_value = None for i, arr in enumerate(obj._iter_column_arrays()): # GH#42736 operate column-wise instead of block-wise try: - res = hfunc(arr) + hresult = hfunc(arr) except (TypeError, NotImplementedError): pass else: + res, ref = ( + hresult + if type(hresult) is tuple + else (cast(ArrayLike, hresult), None) + ) + if len(res_values) == 0: + ref_value = ref + elif ((ref_value is None) != (ref is None)) or not np.array_equal( + cast(np.ndarray, ref_value), cast(np.ndarray, ref) + ): + raise ValueError("hfunc returned inconsistent ref value") res_values.append(res) taker.append(i) df = type(obj)._from_arrays( res_values, - index=obj.index, + index=self._slice_index(obj.index, ref_value), columns=obj.columns.take(taker), verify_integrity=False, ) @@ -491,7 +538,11 @@ def hfunc(values: ArrayLike) -> ArrayLike: return self._resolve_output(df, obj) def _apply_tablewise( - self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None + self, + homogeneous_func: Callable[ + ..., ArrayLike | tuple[ArrayLike, np.ndarray | None] + ], + name: str | None = None, ) -> DataFrame | Series: """ Apply the given function to the DataFrame across the entire object @@ -501,9 +552,14 @@ def _apply_tablewise( obj = 
self._create_data(self._selected_obj) values = self._prep_values(obj.to_numpy()) values = values.T if self.axis == 1 else values - result = homogeneous_func(values) + hresult = homogeneous_func(values) + result, ref = ( + hresult if type(hresult) is tuple else (cast(ArrayLike, hresult), None) + ) result = result.T if self.axis == 1 else result - out = obj._constructor(result, index=obj.index, columns=obj.columns) + index = obj.index if self.axis == 1 else self._slice_index(obj.index, ref) + columns = obj.columns if self.axis != 1 else self._slice_index(obj.columns, ref) + out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) @@ -565,18 +621,19 @@ def homogeneous_func(values: np.ndarray): # calculation function if values.size == 0: - return values.copy() + return values.copy(), np.array([], dtype=np.int64) def calc(x): - start, end = window_indexer.get_window_bounds( + start, end, ref = window_indexer.get_window_bounds2( num_values=len(x), min_periods=min_periods, center=self.center, closed=self.closed, + step=self.step, ) - self._check_window_bounds(start, end, len(x)) + self._check_window_bounds(start, end, ref, len(x)) - return func(x, start, end, min_periods, *numba_args) + return func(x, start, end, min_periods, *numba_args), ref with np.errstate(all="ignore"): result = calc(values) @@ -610,25 +667,30 @@ def _numba_apply( values = self._prep_values(obj.to_numpy()) if values.ndim == 1: values = values.reshape(-1, 1) - start, end = window_indexer.get_window_bounds( + start, end, ref = window_indexer.get_window_bounds2( num_values=len(values), min_periods=min_periods, center=self.center, closed=self.closed, + step=self.step, ) - self._check_window_bounds(start, end, len(values)) + self._check_window_bounds(start, end, ref, len(values)) aggregator = executor.generate_shared_aggregator( func, engine_kwargs, numba_cache_key_str ) result = aggregator(values, start, end, min_periods, *func_args) NUMBA_FUNC_CACHE[(func, 
numba_cache_key_str)] = aggregator result = result.T if self.axis == 1 else result + index = obj.index if self.axis == 1 else self._slice_index(obj.index, ref) if obj.ndim == 1: result = result.squeeze() - out = obj._constructor(result, index=obj.index, name=obj.name) + out = obj._constructor(result, index=index, name=obj.name) return out else: - out = obj._constructor(result, index=obj.index, columns=obj.columns) + columns = ( + obj.columns if self.axis != 1 else self._slice_index(obj.columns, ref) + ) + out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) def aggregate(self, func, *args, **kwargs): @@ -707,7 +769,7 @@ def _apply( group_indices = self._grouper.indices.values() if group_indices: - indexer = np.concatenate(list(group_indices)) + indexer = np.concatenate([ind[:: self.step] for ind in group_indices]) else: indexer = np.array([], dtype=np.intp) codes = [c.take(indexer) for c in codes] @@ -730,6 +792,14 @@ def _apply( result = result.reset_index(level=list(range(len(groupby_keys)))) return result + def _adjust_pairwise_result(self, result): + return concat( + [ + result.take(gb_indices[:: self.step]).reindex(result.index) + for gb_indices in self._grouper.indices.values() + ] + ) + def _apply_pairwise( self, target: DataFrame | Series, @@ -753,12 +823,7 @@ def _apply_pairwise( # from flex_binary_moment to a "transform"-like result # per groupby combination old_result_len = len(result) - result = concat( - [ - result.take(gb_indices).reindex(result.index) - for gb_indices in self._grouper.indices.values() - ] - ) + result = self._adjust_pairwise_result(result) gb_pairs = ( com.maybe_make_list(pair) for pair in self._grouper.indices.keys() @@ -781,7 +846,7 @@ def _apply_pairwise( group_indices = self._grouper.indices.values() if group_indices: - indexer = np.concatenate(list(group_indices)) + indexer = np.concatenate([ind[:: self.step] for ind in group_indices]) else: indexer = np.array([], dtype=np.intp) @@ 
-864,8 +929,8 @@ class Window(BaseWindow): If a BaseIndexer subclass, the window boundaries based on the defined ``get_window_bounds`` method. Additional rolling - keyword arguments, namely ``min_periods``, ``center``, and - ``closed`` will be passed to ``get_window_bounds``. + keyword arguments, namely ``min_periods``, ``center``, ``closed`` and + ``step`` will be passed to ``get_window_bounds``. min_periods : int, default None Minimum number of observations in window required to have a value; @@ -1059,6 +1124,7 @@ class Window(BaseWindow): "axis", "on", "closed", + "step", "method", ] @@ -1151,7 +1217,7 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func, name) + return self._apply_blockwise(homogeneous_func, name)[:: self.step] @doc( _shared_docs["aggregate"], @@ -1590,13 +1656,14 @@ def cov_func(x, y): if self.min_periods is not None else window_indexer.window_size ) - start, end = window_indexer.get_window_bounds( + start, end, ref = window_indexer.get_window_bounds2( num_values=len(x_array), min_periods=min_periods, center=self.center, closed=self.closed, + step=self.step, ) - self._check_window_bounds(start, end, len(x_array)) + self._check_window_bounds(start, end, ref, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean( @@ -1608,7 +1675,7 @@ def cov_func(x, y): notna(x_array + y_array).astype(np.float64), start, end, 0 ) result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof)) - return Series(result, index=x.index, name=x.name) + return Series(result, index=self._slice_index(x.index, ref), name=x.name) return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) @@ -1631,13 +1698,14 @@ def corr_func(x, y): if self.min_periods is not None else window_indexer.window_size ) - start, end = window_indexer.get_window_bounds( + start, end, ref = window_indexer.get_window_bounds2( num_values=len(x_array), min_periods=min_periods, center=self.center, closed=self.closed, + 
step=self.step, ) - self._check_window_bounds(start, end, len(x_array)) + self._check_window_bounds(start, end, ref, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean( @@ -1659,7 +1727,7 @@ def corr_func(x, y): ) denominator = (x_var * y_var) ** 0.5 result = numerator / denominator - return Series(result, index=x.index, name=x.name) + return Series(result, index=self._slice_index(x.index, ref), name=x.name) return self._apply_pairwise(self._selected_obj, other, pairwise, corr_func) @@ -1674,6 +1742,7 @@ class Rolling(RollingAndExpandingMixin): "axis", "on", "closed", + "step", "method", ] @@ -2597,6 +2666,17 @@ class RollingGroupby(BaseWindowGroupby, Rolling): _attributes = Rolling._attributes + BaseWindowGroupby._attributes + def _slice_index(self, index: Index, at: np.ndarray | None) -> Index: + """ + Slices the index of the object. + """ + if at is None: + return index + result = index[at] + if isinstance(index, DatetimeIndex): + result.freq = None + return result + def _get_window_indexer(self) -> GroupbyIndexer: """ Return an indexer class that will compute the window start and end bounds @@ -2639,3 +2719,22 @@ def _validate_monotonic(self): or self._on.hasnans ): self._raise_monotonic_error() + + def _adjust_pairwise_result(self, result): + gb_lens = np.array( + [ + len(gb_indices[:: self.step]) + for gb_indices in self._grouper.indices.values() + ], + dtype=np.int64, + ) + gb_ends = np.cumsum(gb_lens) + gb_starts = np.hstack((0, gb_ends[:-1])) if len(gb_ends) > 0 else gb_ends + return concat( + [ + result.take( + np.arange(gb_starts[i], gb_ends[i], dtype=np.int64) + ).reindex(result.index) + for i in range(len(gb_ends)) + ] + ) From 21eea5b1476389f6bb92d20cfa560c6e28fd6b17 Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Fri, 11 Feb 2022 12:49:19 -0500 Subject: [PATCH 2/7] Revert to original get_window_bounds signature --- pandas/_libs/window/indexers.pyi | 2 +- pandas/_libs/window/indexers.pyx | 4 +- 
pandas/core/indexers/objects.py | 117 ++++----------- pandas/core/window/ewm.py | 6 +- pandas/core/window/rolling.py | 181 ++++++----------------- pandas/tests/window/test_base_indexer.py | 22 ++- pandas/tests/window/test_groupby.py | 7 +- pandas/tests/window/test_rolling.py | 2 +- pandas/tests/window/test_win_type.py | 2 +- 9 files changed, 104 insertions(+), 239 deletions(-) diff --git a/pandas/_libs/window/indexers.pyi b/pandas/_libs/window/indexers.pyi index 6d4f85f350c77..bbb5e6988d0b5 100644 --- a/pandas/_libs/window/indexers.pyi +++ b/pandas/_libs/window/indexers.pyi @@ -10,4 +10,4 @@ def calculate_variable_window_bounds( closed: str | None, step: int | None, index: np.ndarray, # const int64_t[:] -) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 64b9cfc6a6979..736c15d900185 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -147,6 +147,4 @@ def calculate_variable_window_bounds( # right endpoint is open if not right_closed and not center: end[i] -= 1 - ref = (None if step is None or step == 1 - else np.arange(0, num_values, step, dtype='int64')) - return start[::step], end[::step], ref + return start[::step], end[::step] diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 3c0954f607a85..d1a7e200bf307 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -59,16 +59,6 @@ def __init__( for key, value in kwargs.items(): setattr(self, key, value) - def _get_default_ref(self, num_values: int = 0, step: int | None = None): - """ - Returns the default window reference locations. 
- """ - return ( - None - if step is None or step == 1 - else np.arange(0, num_values, step, dtype="int64") - ) - @Appender(get_window_bounds_doc) def get_window_bounds( self, @@ -76,67 +66,24 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - ) -> tuple[np.ndarray, np.ndarray]: - - raise NotImplementedError - - @Appender(get_window_bounds_doc) - def get_window_bounds2( - self, - num_values: int = 0, - min_periods: int | None = None, - center: bool | None = None, - closed: str | None = None, step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: - - start, end = self.get_window_bounds(num_values, min_periods, center, closed) - ref = self._get_default_ref(num_values, step) - return start[::step], end[::step], ref - - -class BaseIndexer2(BaseIndexer): - """Base class for window bounds calculations with step optimization.""" - - @Appender(get_window_bounds_doc) - def get_window_bounds( - self, - num_values: int = 0, - min_periods: int | None = None, - center: bool | None = None, - closed: str | None = None, ) -> tuple[np.ndarray, np.ndarray]: - start, end, ref = self.get_window_bounds2( - num_values, min_periods, center, closed - ) - return start, end - - @Appender(get_window_bounds_doc) - def get_window_bounds2( - self, - num_values: int = 0, - min_periods: int | None = None, - center: bool | None = None, - closed: str | None = None, - step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: - raise NotImplementedError -class FixedWindowIndexer(BaseIndexer2): +class FixedWindowIndexer(BaseIndexer): """Creates window boundaries that are of fixed length.""" @Appender(get_window_bounds_doc) - def get_window_bounds2( + def get_window_bounds( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + ) -> 
tuple[np.ndarray, np.ndarray]: if center: offset = (self.window_size - 1) // 2 @@ -153,22 +100,21 @@ def get_window_bounds2( end = np.clip(end, 0, num_values) start = np.clip(start, 0, num_values) - ref = self._get_default_ref(num_values, step) - return start, end, ref + return start, end -class VariableWindowIndexer(BaseIndexer2): +class VariableWindowIndexer(BaseIndexer): """Creates window boundaries that are of variable length, namely for time series.""" @Appender(get_window_bounds_doc) - def get_window_bounds2( + def get_window_bounds( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + ) -> tuple[np.ndarray, np.ndarray]: # error: Argument 4 to "calculate_variable_window_bounds" has incompatible # type "Optional[bool]"; expected "bool" @@ -207,6 +153,7 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, + step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: # if windows is variable, default is 'right', otherwise default is 'both' @@ -264,31 +211,30 @@ def get_window_bounds( if not right_closed: end[i] -= 1 - return start, end + return start[::step], end[::step] -class ExpandingIndexer(BaseIndexer2): +class ExpandingIndexer(BaseIndexer): """Calculate expanding window bounds, mimicking df.expanding()""" @Appender(get_window_bounds_doc) - def get_window_bounds2( + def get_window_bounds( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + ) -> tuple[np.ndarray, np.ndarray]: if step is None: step = 1 end = np.arange(1, num_values + 1, step, dtype=np.int64) start = np.zeros(len(end), dtype=np.int64) - ref = self._get_default_ref(num_values, step) - return start, end, ref + return start[::step], end[::step] -class 
FixedForwardWindowIndexer(BaseIndexer2): +class FixedForwardWindowIndexer(BaseIndexer): """ Creates window boundaries for fixed-length windows that include the current row. @@ -315,14 +261,14 @@ class FixedForwardWindowIndexer(BaseIndexer2): """ @Appender(get_window_bounds_doc) - def get_window_bounds2( + def get_window_bounds( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + ) -> tuple[np.ndarray, np.ndarray]: if center: raise ValueError("Forward-looking windows can't have center=True") @@ -338,11 +284,10 @@ def get_window_bounds2( if self.window_size: end = np.clip(end, 0, num_values) - ref = self._get_default_ref(num_values, step) - return start, end, ref + return start, end -class GroupbyIndexer(BaseIndexer2): +class GroupbyIndexer(BaseIndexer): """Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()""" def __init__( @@ -382,21 +327,21 @@ def __init__( ) @Appender(get_window_bounds_doc) - def get_window_bounds2( + def get_window_bounds( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + ) -> tuple[np.ndarray, np.ndarray]: + if step not in [None, 1]: + raise NotImplementedError(f"unsupported step: {step}") # 1) For each group, get the indices that belong to the group # 2) Use the indices to calculate the start & end bounds of the window # 3) Append the window bounds in group order start_arrays = [] end_arrays = [] - ref_arrays = [] - empty = np.array([], dtype=np.int64) window_indices_start = 0 for key, indices in self.groupby_indices.items(): index_array: np.ndarray | None @@ -410,12 +355,11 @@ def get_window_bounds2( window_size=self.window_size, **self.indexer_kwargs, ) - start, end, ref = indexer.get_window_bounds2( + start, end = 
indexer.get_window_bounds( len(indices), min_periods, center, closed, step ) start = start.astype(np.int64) end = end.astype(np.int64) - ref = None if ref is None else ref.astype(np.int64) assert len(start) == len( end ), "these should be equal in length from get_window_bounds" @@ -431,30 +375,27 @@ def get_window_bounds2( ) start_arrays.append(window_indices.take(ensure_platform_int(start))) end_arrays.append(window_indices.take(ensure_platform_int(end))) - ref_arrays.append( - empty if ref is None else window_indices.take(ensure_platform_int(ref)) - ) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) - ref = None if step is None or step == 1 else np.concatenate(ref_arrays) - return start, end, ref + return start, end -class ExponentialMovingWindowIndexer(BaseIndexer2): +class ExponentialMovingWindowIndexer(BaseIndexer): """Calculate ewm window bounds (the entire window)""" @Appender(get_window_bounds_doc) - def get_window_bounds2( + def get_window_bounds( self, num_values: int = 0, min_periods: int | None = None, center: bool | None = None, closed: str | None = None, step: int | None = None, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]: + ) -> tuple[np.ndarray, np.ndarray]: + if step not in [None, 1]: + raise NotImplementedError(f"unsupported step: {step}") return ( np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64), - None, ) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 085c159e5852b..02328afcfd3f2 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -418,7 +418,7 @@ def __init__( ) def _check_window_bounds( - self, start: np.ndarray, end: np.ndarray, ref: np.ndarray | None, num_vals: int + self, start: np.ndarray, end: np.ndarray, num_vals: int ) -> None: # emw algorithms are iterative with each point # ExponentialMovingWindowIndexer "bounds" are the entire window @@ -732,7 +732,7 @@ def cov_func(x, y): if self.min_periods is not None else 
window_indexer.window_size ) - start, end, ref = window_indexer.get_window_bounds2( + start, end = window_indexer.get_window_bounds( num_values=len(x_array), min_periods=min_periods, center=self.center, @@ -799,7 +799,7 @@ def cov_func(x, y): if self.min_periods is not None else window_indexer.window_size ) - start, end, ref = window_indexer.get_window_bounds2( + start, end = window_indexer.get_window_bounds( num_values=len(x_array), min_periods=min_periods, center=self.center, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index fc8812de0668b..30881c703dc6c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,7 +14,6 @@ Any, Callable, Hashable, - cast, ) import warnings @@ -216,34 +215,28 @@ def _validate(self) -> None: if isinstance(self.window, BaseIndexer): # Validate that the passed BaseIndexer subclass has # a get_window_bounds with the correct signature. - for meth in ["get_window_bounds", "get_window_bounds2"]: - existing_signature = inspect.signature( - getattr(self.window, meth) - ).parameters.keys() - expected_signature = inspect.signature( - getattr(BaseIndexer(), meth) - ).parameters.keys() - if existing_signature != expected_signature: - raise ValueError( - f"{type(self.window).__name__} does not implement " - f"the correct signature for {meth}" - ) + get_window_bounds_signature = inspect.signature( + self.window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + BaseIndexer().get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(self.window).__name__} does not implement " + f"the correct signature for get_window_bounds" + ) if self.method not in ["table", "single"]: raise ValueError("method must be 'table' or 'single") def _check_window_bounds( - self, start: np.ndarray, end: np.ndarray, ref: np.ndarray | None, num_vals: int + self, start: np.ndarray, end: np.ndarray, num_vals: int ) -> 
None: if len(start) != len(end): raise ValueError( f"start ({len(start)}) and end ({len(end)}) bounds must be the " f"same length" ) - if ref is not None and len(start) != len(ref): - raise ValueError( - f"start ({len(start)}) and ref ({len(ref)}) arrays must be the " - f"same length" - ) elif not isinstance(self._get_window_indexer(), GroupbyIndexer) and len( start ) != (num_vals + (self.step or 1) - 1) // (self.step or 1): @@ -253,17 +246,6 @@ def _check_window_bounds( f"if given and rounded up unless groupby was used" ) - def _slice_index(self, index: Index, at: np.ndarray | None) -> Index: - """ - Slices the index of the object. - """ - if at is None: - return index - result = index[at] - if isinstance(index, DatetimeIndex): - result.freq = None if index.freq is None else index.freq * (self.step or 1) - return result - def _create_data(self, obj: NDFrameT) -> NDFrameT: """ Split data into blocks & return conformed data. @@ -342,14 +324,14 @@ def __iter__(self): obj = self._create_data(obj) indexer = self._get_window_indexer() - start, end, ref = indexer.get_window_bounds2( + start, end = indexer.get_window_bounds( num_values=len(obj), min_periods=self.min_periods, center=self.center, closed=self.closed, step=self.step, ) - self._check_window_bounds(start, end, ref, len(obj)) + self._check_window_bounds(start, end, len(obj)) for s, e in zip(start, end): result = obj.iloc[slice(s, e)] @@ -437,11 +419,7 @@ def _get_window_indexer(self) -> BaseIndexer: return FixedWindowIndexer(window_size=self.window) def _apply_series( - self, - homogeneous_func: Callable[ - ..., ArrayLike | tuple[ArrayLike, np.ndarray | None] - ], - name: str | None = None, + self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> Series: """ Series version of _apply_blockwise @@ -456,18 +434,11 @@ def _apply_series( except (TypeError, NotImplementedError) as err: raise DataError("No numeric types to aggregate") from err - result, ref = homogeneous_func(values), None - if 
type(result) is tuple: - result, ref = result - index = self._slice_index(obj.index, ref) - return obj._constructor(result, index=index, name=obj.name) + result = homogeneous_func(values) + return obj._constructor(result, index=obj.index, name=obj.name) def _apply_blockwise( - self, - homogeneous_func: Callable[ - ..., ArrayLike | tuple[ArrayLike, np.ndarray | None] - ], - name: str | None = None, + self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> DataFrame | Series: """ Apply the given function to the DataFrame broken down into homogeneous @@ -482,7 +453,7 @@ def _apply_blockwise( obj = notna(obj).astype(int) obj._mgr = obj._mgr.consolidate() - def hfunc(values: ArrayLike) -> ArrayLike | tuple[ArrayLike, np.ndarray | None]: + def hfunc(values: ArrayLike) -> ArrayLike: values = self._prep_values(values) return homogeneous_func(values) @@ -490,32 +461,20 @@ def hfunc(values: ArrayLike) -> ArrayLike | tuple[ArrayLike, np.ndarray | None]: obj = obj.T taker = [] - res_values: list[ArrayLike] = [] - ref_value = None + res_values = [] for i, arr in enumerate(obj._iter_column_arrays()): # GH#42736 operate column-wise instead of block-wise try: - hresult = hfunc(arr) + res = hfunc(arr) except (TypeError, NotImplementedError): pass else: - res, ref = ( - hresult - if type(hresult) is tuple - else (cast(ArrayLike, hresult), None) - ) - if len(res_values) == 0: - ref_value = ref - elif ((ref_value is None) != (ref is None)) or not np.array_equal( - cast(np.ndarray, ref_value), cast(np.ndarray, ref) - ): - raise ValueError("hfunc returned inconsistent ref value") res_values.append(res) taker.append(i) df = type(obj)._from_arrays( res_values, - index=self._slice_index(obj.index, ref_value), + index=obj.index, columns=obj.columns.take(taker), verify_integrity=False, ) @@ -538,11 +497,7 @@ def hfunc(values: ArrayLike) -> ArrayLike | tuple[ArrayLike, np.ndarray | None]: return self._resolve_output(df, obj) def _apply_tablewise( - self, - 
homogeneous_func: Callable[ - ..., ArrayLike | tuple[ArrayLike, np.ndarray | None] - ], - name: str | None = None, + self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> DataFrame | Series: """ Apply the given function to the DataFrame across the entire object @@ -552,14 +507,9 @@ def _apply_tablewise( obj = self._create_data(self._selected_obj) values = self._prep_values(obj.to_numpy()) values = values.T if self.axis == 1 else values - hresult = homogeneous_func(values) - result, ref = ( - hresult if type(hresult) is tuple else (cast(ArrayLike, hresult), None) - ) + result = homogeneous_func(values) result = result.T if self.axis == 1 else result - index = obj.index if self.axis == 1 else self._slice_index(obj.index, ref) - columns = obj.columns if self.axis != 1 else self._slice_index(obj.columns, ref) - out = obj._constructor(result, index=index, columns=columns) + out = obj._constructor(result, index=obj.index, columns=obj.columns) return self._resolve_output(out, obj) @@ -621,19 +571,19 @@ def homogeneous_func(values: np.ndarray): # calculation function if values.size == 0: - return values.copy(), np.array([], dtype=np.int64) + return values.copy() def calc(x): - start, end, ref = window_indexer.get_window_bounds2( + start, end = window_indexer.get_window_bounds( num_values=len(x), min_periods=min_periods, center=self.center, closed=self.closed, step=self.step, ) - self._check_window_bounds(start, end, ref, len(x)) + self._check_window_bounds(start, end, len(x)) - return func(x, start, end, min_periods, *numba_args), ref + return func(x, start, end, min_periods, *numba_args) with np.errstate(all="ignore"): result = calc(values) @@ -667,30 +617,26 @@ def _numba_apply( values = self._prep_values(obj.to_numpy()) if values.ndim == 1: values = values.reshape(-1, 1) - start, end, ref = window_indexer.get_window_bounds2( + start, end = window_indexer.get_window_bounds( num_values=len(values), min_periods=min_periods, center=self.center, 
closed=self.closed, step=self.step, ) - self._check_window_bounds(start, end, ref, len(values)) + self._check_window_bounds(start, end, len(values)) aggregator = executor.generate_shared_aggregator( func, engine_kwargs, numba_cache_key_str ) result = aggregator(values, start, end, min_periods, *func_args) NUMBA_FUNC_CACHE[(func, numba_cache_key_str)] = aggregator result = result.T if self.axis == 1 else result - index = obj.index if self.axis == 1 else self._slice_index(obj.index, ref) if obj.ndim == 1: result = result.squeeze() - out = obj._constructor(result, index=index, name=obj.name) + out = obj._constructor(result, index=obj.index, name=obj.name) return out else: - columns = ( - obj.columns if self.axis != 1 else self._slice_index(obj.columns, ref) - ) - out = obj._constructor(result, index=index, columns=columns) + out = obj._constructor(result, index=obj.index, columns=obj.columns) return self._resolve_output(out, obj) def aggregate(self, func, *args, **kwargs): @@ -769,7 +715,7 @@ def _apply( group_indices = self._grouper.indices.values() if group_indices: - indexer = np.concatenate([ind[:: self.step] for ind in group_indices]) + indexer = np.concatenate(list(group_indices)) else: indexer = np.array([], dtype=np.intp) codes = [c.take(indexer) for c in codes] @@ -792,14 +738,6 @@ def _apply( result = result.reset_index(level=list(range(len(groupby_keys)))) return result - def _adjust_pairwise_result(self, result): - return concat( - [ - result.take(gb_indices[:: self.step]).reindex(result.index) - for gb_indices in self._grouper.indices.values() - ] - ) - def _apply_pairwise( self, target: DataFrame | Series, @@ -823,7 +761,12 @@ def _apply_pairwise( # from flex_binary_moment to a "transform"-like result # per groupby combination old_result_len = len(result) - result = self._adjust_pairwise_result(result) + result = concat( + [ + result.take(gb_indices).reindex(result.index) + for gb_indices in self._grouper.indices.values() + ] + ) gb_pairs = ( 
com.maybe_make_list(pair) for pair in self._grouper.indices.keys() @@ -846,7 +789,7 @@ def _apply_pairwise( group_indices = self._grouper.indices.values() if group_indices: - indexer = np.concatenate([ind[:: self.step] for ind in group_indices]) + indexer = np.concatenate(list(group_indices)) else: indexer = np.array([], dtype=np.intp) @@ -1656,14 +1599,14 @@ def cov_func(x, y): if self.min_periods is not None else window_indexer.window_size ) - start, end, ref = window_indexer.get_window_bounds2( + start, end = window_indexer.get_window_bounds( num_values=len(x_array), min_periods=min_periods, center=self.center, closed=self.closed, step=self.step, ) - self._check_window_bounds(start, end, ref, len(x_array)) + self._check_window_bounds(start, end, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean( @@ -1675,7 +1618,7 @@ def cov_func(x, y): notna(x_array + y_array).astype(np.float64), start, end, 0 ) result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof)) - return Series(result, index=self._slice_index(x.index, ref), name=x.name) + return Series(result, index=x.index, name=x.name) return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) @@ -1698,14 +1641,14 @@ def corr_func(x, y): if self.min_periods is not None else window_indexer.window_size ) - start, end, ref = window_indexer.get_window_bounds2( + start, end = window_indexer.get_window_bounds( num_values=len(x_array), min_periods=min_periods, center=self.center, closed=self.closed, step=self.step, ) - self._check_window_bounds(start, end, ref, len(x_array)) + self._check_window_bounds(start, end, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean( @@ -1727,7 +1670,7 @@ def corr_func(x, y): ) denominator = (x_var * y_var) ** 0.5 result = numerator / denominator - return Series(result, index=self._slice_index(x.index, ref), name=x.name) + return Series(result, index=x.index, name=x.name) return 
self._apply_pairwise(self._selected_obj, other, pairwise, corr_func) @@ -2666,17 +2609,6 @@ class RollingGroupby(BaseWindowGroupby, Rolling): _attributes = Rolling._attributes + BaseWindowGroupby._attributes - def _slice_index(self, index: Index, at: np.ndarray | None) -> Index: - """ - Slices the index of the object. - """ - if at is None: - return index - result = index[at] - if isinstance(index, DatetimeIndex): - result.freq = None - return result - def _get_window_indexer(self) -> GroupbyIndexer: """ Return an indexer class that will compute the window start and end bounds @@ -2719,22 +2651,3 @@ def _validate_monotonic(self): or self._on.hasnans ): self._raise_monotonic_error() - - def _adjust_pairwise_result(self, result): - gb_lens = np.array( - [ - len(gb_indices[:: self.step]) - for gb_indices in self._grouper.indices.values() - ], - dtype=np.int64, - ) - gb_ends = np.cumsum(gb_lens) - gb_starts = np.hstack((0, gb_ends[:-1])) if len(gb_ends) > 0 else gb_ends - return concat( - [ - result.take( - np.arange(gb_starts[i], gb_ends[i], dtype=np.int64) - ).reindex(result.index) - for i in range(len(gb_ends)) - ] - ) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 5593aa8351c69..c58d64dd3c2bb 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -46,7 +46,7 @@ def test_indexer_constructor_arg(): df = DataFrame({"values": range(5)}) class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, num_values, min_periods, center, closed, step): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) for i in range(num_values): @@ -68,11 +68,17 @@ def test_indexer_accepts_rolling_args(): df = DataFrame({"values": range(5)}) class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, 
num_values, min_periods, center, closed, step): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) for i in range(num_values): - if center and min_periods == 1 and closed == "both" and i == 2: + if ( + center + and min_periods == 1 + and closed == "both" + and step == 1 + and i == 2 + ): start[i] = 0 end[i] = num_values else: @@ -81,7 +87,9 @@ def get_window_bounds(self, num_values, min_periods, center, closed): return start, end indexer = CustomIndexer(window_size=1) - result = df.rolling(indexer, center=True, min_periods=1, closed="both").sum() + result = df.rolling( + indexer, center=True, min_periods=1, closed="both", step=1 + ).sum() expected = DataFrame({"values": [0.0, 1.0, 10.0, 3.0, 4.0]}) tm.assert_frame_equal(result, expected) @@ -277,7 +285,7 @@ def test_fixed_forward_indexer_count(): def test_indexer_quantile_sum(end_value, values, func, args): # GH 37153 class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, num_values, min_periods, center, closed, step): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) for i in range(num_values): @@ -456,7 +464,7 @@ def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size): def test_unequal_start_end_bounds(): class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, num_values, min_periods, center, closed, step): return np.array([1]), np.array([1, 2]) indexer = CustomIndexer() @@ -478,7 +486,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): def test_unequal_bounds_to_object(): # GH 44470 class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, num_values, min_periods, center, closed, step): return np.array([1]), np.array([2]) indexer = CustomIndexer() diff --git 
a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 5125587df9ea2..697a6c5da8cfe 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -446,7 +446,12 @@ def test_groupby_rolling_custom_indexer(self): # GH 35557 class SimpleIndexer(BaseIndexer): def get_window_bounds( - self, num_values=0, min_periods=None, center=None, closed=None + self, + num_values=0, + min_periods=None, + center=None, + closed=None, + step=None, ): min_periods = self.window_size if min_periods is None else 0 end = np.arange(num_values, dtype=np.int64) + 1 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index ced163178f73a..4f68538f6905e 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1362,7 +1362,7 @@ def test_rolling_non_monotonic(method, expected): df = DataFrame({"values": np.arange(len(use_expanding)) ** 2}) class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, num_values, min_periods, center, closed, step): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) for i in range(num_values): diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index 03ea745d9cb86..adb3fffbd5750 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -173,7 +173,7 @@ def test_win_type_freq_return_deprecation(): @td.skip_if_no_scipy def test_win_type_not_implemented(): class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, num_values, min_periods, center, closed, step): return np.array([0, 1]), np.array([1, 2]) df = DataFrame({"values": range(2)}) From ebb27f559a11ea19fc11ee9d1d997f53385c7fdb Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Sat, 12 Feb 2022 15:12:02 -0500 Subject: [PATCH 
3/7] Add tests for step-parameter --- pandas/core/window/rolling.py | 34 +++++-- pandas/tests/window/conftest.py | 25 ++++++ pandas/tests/window/test_api.py | 89 ++++++++++--------- pandas/tests/window/test_apply.py | 56 +++++++----- pandas/tests/window/test_base_indexer.py | 36 ++++---- pandas/tests/window/test_dtypes.py | 20 +++-- pandas/tests/window/test_numba.py | 54 ++++++----- pandas/tests/window/test_rolling.py | 70 ++++++++------- pandas/tests/window/test_rolling_functions.py | 60 +++++++------ pandas/tests/window/test_rolling_quantile.py | 20 +++-- pandas/tests/window/test_rolling_skew_kurt.py | 42 +++++---- pandas/tests/window/test_win_type.py | 70 ++++++++------- 12 files changed, 342 insertions(+), 234 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 30881c703dc6c..fdca3f88e9128 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -435,7 +435,8 @@ def _apply_series( raise DataError("No numeric types to aggregate") from err result = homogeneous_func(values) - return obj._constructor(result, index=obj.index, name=obj.name) + index = obj.index if len(result) == len(obj.index) else obj.index[:: self.step] + return obj._constructor(result, index=index, name=obj.name) def _apply_blockwise( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None @@ -472,9 +473,14 @@ def hfunc(values: ArrayLike) -> ArrayLike: res_values.append(res) taker.append(i) + index = ( + obj.index + if len(res_values) == 0 or len(res_values[0]) == len(obj.index) + else obj.index[:: self.step] + ) df = type(obj)._from_arrays( res_values, - index=obj.index, + index=index, columns=obj.columns.take(taker), verify_integrity=False, ) @@ -509,7 +515,13 @@ def _apply_tablewise( values = values.T if self.axis == 1 else values result = homogeneous_func(values) result = result.T if self.axis == 1 else result - out = obj._constructor(result, index=obj.index, columns=obj.columns) + index = obj.index if 
len(result) == len(obj.index) else obj.index[:: self.step] + columns = ( + obj.columns + if result.shape[1] == len(obj.columns) + else obj.columns[:: self.step] + ) + out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) @@ -631,12 +643,18 @@ def _numba_apply( result = aggregator(values, start, end, min_periods, *func_args) NUMBA_FUNC_CACHE[(func, numba_cache_key_str)] = aggregator result = result.T if self.axis == 1 else result + index = obj.index if len(result) == len(obj.index) else obj.index[:: self.step] if obj.ndim == 1: result = result.squeeze() - out = obj._constructor(result, index=obj.index, name=obj.name) + out = obj._constructor(result, index=index, name=obj.name) return out else: - out = obj._constructor(result, index=obj.index, columns=obj.columns) + columns = ( + obj.columns + if result.shape[1] == len(obj.columns) + else obj.columns[:: self.step] + ) + out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) def aggregate(self, func, *args, **kwargs): @@ -1588,6 +1606,9 @@ def cov( ddof: int = 1, **kwargs, ): + if self.step not in [None, 1]: + raise NotImplementedError(f"invalid step: {self.step}") + from pandas import Series def cov_func(x, y): @@ -1630,6 +1651,9 @@ def corr( **kwargs, ): + if self.step not in [None, 1]: + raise NotImplementedError(f"invalid step: {self.step}") + from pandas import Series def corr_func(x, y): diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index f2832652ed58f..715e4bcf6feaf 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -126,3 +126,28 @@ def frame(): index=bdate_range(datetime(2009, 1, 1), periods=100), columns=np.arange(10), ) + + +@pytest.fixture(params=[None, 1, 2, 5, 10]) +def step(request): + """step keyword argument for rolling window operations.""" + return request.param + + +@pytest.fixture +def step_methods(): + """Make a step-argument helper as 
fixture.""" + + class StepMethods: + @staticmethod + def get_selected_indices(step, group_keys): + """Return step-selected indices within groups.""" + step = step or 1 + group_ind = {} + for i, key in enumerate(group_keys): + group_ind.setdefault(key, []).append(i) + return sorted( + ind[j] for ind in group_ind.values() for j in range(0, len(ind), step) + ) + + return StepMethods diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index f84a579247630..6dbcc8dfd00c0 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -16,20 +16,20 @@ from pandas.core.base import SpecificationError -def test_getitem(): +def test_getitem(step): frame = DataFrame(np.random.randn(5, 5)) - r = frame.rolling(window=5) - tm.assert_index_equal(r._selected_obj.columns, frame.columns) + r = frame.rolling(window=5, step=step) + tm.assert_index_equal(r._selected_obj.columns, frame[::step].columns) - r = frame.rolling(window=5)[1] - assert r._selected_obj.name == frame.columns[1] + r = frame.rolling(window=5, step=step)[1] + assert r._selected_obj.name == frame[::step].columns[1] # technically this is allowed - r = frame.rolling(window=5)[1, 3] - tm.assert_index_equal(r._selected_obj.columns, frame.columns[[1, 3]]) + r = frame.rolling(window=5, step=step)[1, 3] + tm.assert_index_equal(r._selected_obj.columns, frame[::step].columns[[1, 3]]) - r = frame.rolling(window=5)[[1, 3]] - tm.assert_index_equal(r._selected_obj.columns, frame.columns[[1, 3]]) + r = frame.rolling(window=5, step=step)[[1, 3]] + tm.assert_index_equal(r._selected_obj.columns, frame[::step].columns[[1, 3]]) def test_select_bad_cols(): @@ -53,21 +53,21 @@ def test_attribute_access(): r.F -def tests_skip_nuisance(): +def tests_skip_nuisance(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) - r = df.rolling(window=3) + r = df.rolling(window=3, step=step) result = r[["A", "B"]].sum() expected = DataFrame( {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, 
np.nan, 18, 21, 24]}, columns=list("AB"), - ) + )[::step] tm.assert_frame_equal(result, expected) -def test_skip_sum_object_raises(): +def test_skip_sum_object_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) - r = df.rolling(window=3) + r = df.rolling(window=3, step=step) msg = r"nuisance columns.*Dropped columns were Index\(\['C'\], dtype='object'\)" with tm.assert_produces_warning(FutureWarning, match=msg): # GH#42738 @@ -75,14 +75,14 @@ def test_skip_sum_object_raises(): expected = DataFrame( {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, columns=list("AB"), - ) + )[::step] tm.assert_frame_equal(result, expected) -def test_agg(): +def test_agg(step): df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) - r = df.rolling(window=3) + r = df.rolling(window=3, step=step) a_mean = r["A"].mean() a_std = r["A"].std() a_sum = r["A"].sum() @@ -141,10 +141,10 @@ def test_agg_apply(raw): tm.assert_frame_equal(result, expected, check_like=True) -def test_agg_consistency(): +def test_agg_consistency(step): df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) - r = df.rolling(window=3) + r = df.rolling(window=3, step=step) result = r.agg([np.sum, np.mean]).columns expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) @@ -182,7 +182,7 @@ def test_agg_nested_dicts(): r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) -def test_count_nonnumeric_types(): +def test_count_nonnumeric_types(step): # GH12541 cols = [ "int", @@ -239,13 +239,13 @@ def test_count_nonnumeric_types(): "periods_nat": [1.0, 2.0, 1.0], }, columns=cols, - ) + )[::step] - result = df.rolling(window=2, min_periods=0).count() + result = df.rolling(window=2, min_periods=0, step=step).count() tm.assert_frame_equal(result, expected) - result = df.rolling(1, min_periods=0).count() - expected = df.notna().astype(float) + result = df.rolling(1, min_periods=0, step=step).count() + expected = df.notna().astype(float)[::step] 
tm.assert_frame_equal(result, expected) @@ -339,11 +339,11 @@ def test_validate_deprecated(): @pytest.mark.filterwarnings("ignore:min_periods:FutureWarning") def test_dont_modify_attributes_after_methods( - arithmetic_win_operators, closed, center, min_periods + arithmetic_win_operators, closed, center, min_periods, step ): # GH 39554 roll_obj = Series(range(1)).rolling( - 1, center=center, closed=closed, min_periods=min_periods + 1, center=center, closed=closed, min_periods=min_periods, step=step ) expected = {attr: getattr(roll_obj, attr) for attr in roll_obj._attributes} getattr(roll_obj, arithmetic_win_operators)() @@ -351,40 +351,49 @@ def test_dont_modify_attributes_after_methods( assert result == expected -def test_centered_axis_validation(): +def test_centered_axis_validation(step): # ok - Series(np.ones(10)).rolling(window=3, center=True, axis=0).mean() + Series(np.ones(10)).rolling(window=3, center=True, axis=0, step=step).mean() # bad axis msg = "No axis named 1 for object type Series" with pytest.raises(ValueError, match=msg): - Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean() + Series(np.ones(10)).rolling(window=3, center=True, axis=1, step=step).mean() # ok ok - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=0).mean() - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=1).mean() + DataFrame(np.ones((10, 10))).rolling( + window=3, center=True, axis=0, step=step + ).mean() + DataFrame(np.ones((10, 10))).rolling( + window=3, center=True, axis=1, step=step + ).mean() # bad axis msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): - (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) + ( + DataFrame(np.ones((10, 10))) + .rolling(window=3, center=True, axis=2, step=step) + .mean() + ) -def test_rolling_min_min_periods(): +def test_rolling_min_min_periods(step): a = Series([1, 2, 3, 4, 5]) - result = a.rolling(window=100, min_periods=1).min() - 
expected = Series(np.ones(len(a))) + result = a.rolling(window=100, min_periods=1, step=step).min() + expected = Series(np.ones(len(a)))[::step] tm.assert_series_equal(result, expected) msg = "min_periods 5 must be <= window 3" with pytest.raises(ValueError, match=msg): - Series([1, 2, 3]).rolling(window=3, min_periods=5).min() + Series([1, 2, 3]).rolling(window=3, min_periods=5, step=step).min() -def test_rolling_max_min_periods(): +def test_rolling_max_min_periods(step): a = Series([1, 2, 3, 4, 5], dtype=np.float64) - b = a.rolling(window=100, min_periods=1).max() - tm.assert_almost_equal(a, b) + result = a.rolling(window=100, min_periods=1, step=step).max() + expected = a[::step] + tm.assert_almost_equal(result, expected) msg = "min_periods 5 must be <= window 3" with pytest.raises(ValueError, match=msg): - Series([1, 2, 3]).rolling(window=3, min_periods=5).max() + Series([1, 2, 3]).rolling(window=3, min_periods=5, step=step).max() diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 8e690a677aa98..d4f5fed0a5d3c 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -52,7 +52,7 @@ def test_rolling_apply_out_of_bounds(engine_and_raw): @pytest.mark.parametrize("window", [2, "2s"]) -def test_rolling_apply_with_pandas_objects(window): +def test_rolling_apply_with_pandas_objects(window, step): # 5071 df = DataFrame( {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, @@ -66,32 +66,36 @@ def f(x): return np.nan return x.iloc[-1] - result = df.rolling(window).apply(f, raw=False) - expected = df.iloc[2:].reindex_like(df) + result = df.rolling(window, step=step).apply(f, raw=False) + expected = df.iloc[2:].reindex_like(df)[::step] tm.assert_frame_equal(result, expected) with tm.external_error_raised(AttributeError): df.rolling(window).apply(f, raw=True) -def test_rolling_apply(engine_and_raw): +def test_rolling_apply(engine_and_raw, step): engine, raw = engine_and_raw expected = 
Series([], dtype="float64") - result = expected.rolling(10).apply(lambda x: x.mean(), engine=engine, raw=raw) + result = expected.rolling(10, step=step).apply( + lambda x: x.mean(), engine=engine, raw=raw + ) tm.assert_series_equal(result, expected) # gh-8080 s = Series([None, None, None]) - result = s.rolling(2, min_periods=0).apply(lambda x: len(x), engine=engine, raw=raw) - expected = Series([1.0, 2.0, 2.0]) + result = s.rolling(2, min_periods=0, step=step).apply( + lambda x: len(x), engine=engine, raw=raw + ) + expected = Series([1.0, 2.0, 2.0])[::step] tm.assert_series_equal(result, expected) - result = s.rolling(2, min_periods=0).apply(len, engine=engine, raw=raw) + result = s.rolling(2, min_periods=0, step=step).apply(len, engine=engine, raw=raw) tm.assert_series_equal(result, expected) -def test_all_apply(engine_and_raw): +def test_all_apply(engine_and_raw, step): engine, raw = engine_and_raw df = ( @@ -100,15 +104,15 @@ def test_all_apply(engine_and_raw): ).set_index("A") * 2 ) - er = df.rolling(window=1) - r = df.rolling(window="1s") + er = df.rolling(window=1, step=step) + r = df.rolling(window="1s", step=step) result = r.apply(lambda x: 1, engine=engine, raw=raw) expected = er.apply(lambda x: 1, engine=engine, raw=raw) tm.assert_frame_equal(result, expected) -def test_ragged_apply(engine_and_raw): +def test_ragged_apply(engine_and_raw, step): engine, raw = engine_and_raw df = DataFrame({"B": range(5)}) @@ -121,18 +125,24 @@ def test_ragged_apply(engine_and_raw): ] f = lambda x: 1 - result = df.rolling(window="1s", min_periods=1).apply(f, engine=engine, raw=raw) - expected = df.copy() + result = df.rolling(window="1s", min_periods=1, step=step).apply( + f, engine=engine, raw=raw + ) + expected = df.copy()[::step] expected["B"] = 1.0 tm.assert_frame_equal(result, expected) - result = df.rolling(window="2s", min_periods=1).apply(f, engine=engine, raw=raw) - expected = df.copy() + result = df.rolling(window="2s", min_periods=1, step=step).apply( + f, 
engine=engine, raw=raw + ) + expected = df.copy()[::step] expected["B"] = 1.0 tm.assert_frame_equal(result, expected) - result = df.rolling(window="5s", min_periods=1).apply(f, engine=engine, raw=raw) - expected = df.copy() + result = df.rolling(window="5s", min_periods=1, step=step).apply( + f, engine=engine, raw=raw + ) + expected = df.copy()[::step] expected["B"] = 1.0 tm.assert_frame_equal(result, expected) @@ -266,9 +276,13 @@ def test_time_rule_frame(raw, frame): @pytest.mark.parametrize("minp", [0, 99, 100]) -def test_min_periods(raw, series, minp): - result = series.rolling(len(series) + 1, min_periods=minp).apply(f, raw=raw) - expected = series.rolling(len(series), min_periods=minp).apply(f, raw=raw) +def test_min_periods(raw, series, minp, step): + result = series.rolling(len(series) + 1, min_periods=minp, step=step).apply( + f, raw=raw + ) + expected = series.rolling(len(series), min_periods=minp, step=step).apply( + f, raw=raw + ) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index c58d64dd3c2bb..76dede02f489d 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -149,7 +149,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed, step): ], ) @pytest.mark.filterwarnings("ignore:min_periods:FutureWarning") -def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs): +def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs, step): # GH 32865 values = np.arange(10.0) values[5] = 100.0 @@ -166,11 +166,11 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) rolling = constructor(values).rolling(window=indexer, closed="right") getattr(rolling, func)() - rolling = constructor(values).rolling(window=indexer, min_periods=2) + rolling = constructor(values).rolling(window=indexer, min_periods=2, 
step=step) result = getattr(rolling, func)() # Check that the function output matches the explicitly provided array - expected = constructor(expected) + expected = constructor(expected)[::step] tm.assert_equal(result, expected) # Check that the rolling function output matches applying an alternative @@ -190,12 +190,12 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) @pytest.mark.parametrize("constructor", [Series, DataFrame]) -def test_rolling_forward_skewness(constructor): +def test_rolling_forward_skewness(constructor, step): values = np.arange(10.0) values[5] = 100.0 indexer = FixedForwardWindowIndexer(window_size=5) - rolling = constructor(values).rolling(window=indexer, min_periods=3) + rolling = constructor(values).rolling(window=indexer, min_periods=3, step=step) result = rolling.skew() expected = constructor( @@ -211,7 +211,7 @@ def test_rolling_forward_skewness(constructor): np.nan, np.nan, ] - ) + )[::step] tm.assert_equal(result, expected) @@ -247,7 +247,7 @@ def test_rolling_forward_cov_corr(func, expected): # We are interested in checking only pairwise covariance / correlation result = getattr(rolling, func)().loc[(slice(None), 1), 0] result = result.reset_index(drop=True) - expected = Series(expected) + expected = Series(expected).reset_index(drop=True) expected.name = result.name tm.assert_equal(result, expected) @@ -259,22 +259,22 @@ def test_rolling_forward_cov_corr(func, expected): ["left", [0.0, 0.0, 1.0, 2.0, 5.0, 9.0, 5.0, 6.0, 7.0, 8.0]], ], ) -def test_non_fixed_variable_window_indexer(closed, expected_data): +def test_non_fixed_variable_window_indexer(closed, expected_data, step): index = date_range("2020", periods=10) df = DataFrame(range(10), index=index) offset = BusinessDay(1) indexer = VariableOffsetWindowIndexer(index=index, offset=offset) - result = df.rolling(indexer, closed=closed).sum() - expected = DataFrame(expected_data, index=index) + result = df.rolling(indexer, closed=closed, 
step=step).sum() + expected = DataFrame(expected_data, index=index)[::step] tm.assert_frame_equal(result, expected) -def test_fixed_forward_indexer_count(): +def test_fixed_forward_indexer_count(step): # GH: 35579 df = DataFrame({"b": [None, None, None, 7]}) indexer = FixedForwardWindowIndexer(window_size=2) - result = df.rolling(window=indexer, min_periods=0).count() - expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]}) + result = df.rolling(window=indexer, min_periods=0, step=step).count() + expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]})[::step] tm.assert_frame_equal(result, expected) @@ -346,14 +346,16 @@ def test_indexers_are_reusable_after_groupby_rolling( ], ) def test_fixed_forward_indexer_bounds( - window_size, num_values, expected_start, expected_end + window_size, num_values, expected_start, expected_end, step ): # GH 43267 indexer = FixedForwardWindowIndexer(window_size=window_size) - start, end = indexer.get_window_bounds(num_values=num_values) + start, end = indexer.get_window_bounds(num_values=num_values, step=step) - tm.assert_numpy_array_equal(start, np.array(expected_start), check_dtype=False) - tm.assert_numpy_array_equal(end, np.array(expected_end), check_dtype=False) + tm.assert_numpy_array_equal( + start, np.array(expected_start[::step]), check_dtype=False + ) + tm.assert_numpy_array_equal(end, np.array(expected_end[::step]), check_dtype=False) assert len(start) == len(end) diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 80a96c3a8cee9..161976a6112a5 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -90,9 +90,11 @@ def dtypes(request): ), ], ) -def test_series_dtypes(method, data, expected_data, coerce_int, dtypes, min_periods): +def test_series_dtypes( + method, data, expected_data, coerce_int, dtypes, min_periods, step +): ser = Series(data, dtype=get_dtype(dtypes, coerce_int=coerce_int)) - rolled = ser.rolling(2, min_periods=min_periods) + rolled = 
ser.rolling(2, min_periods=min_periods, step=step) if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count": msg = "No numeric types to aggregate" @@ -100,15 +102,15 @@ def test_series_dtypes(method, data, expected_data, coerce_int, dtypes, min_peri getattr(rolled, method)() else: result = getattr(rolled, method)() - expected = Series(expected_data, dtype="float64") + expected = Series(expected_data, dtype="float64")[::step] tm.assert_almost_equal(result, expected) -def test_series_nullable_int(any_signed_int_ea_dtype): +def test_series_nullable_int(any_signed_int_ea_dtype, step): # GH 43016 ser = Series([0, 1, NA], dtype=any_signed_int_ea_dtype) - result = ser.rolling(2).mean() - expected = Series([np.nan, 0.5, np.nan]) + result = ser.rolling(2, step=step).mean() + expected = Series([np.nan, 0.5, np.nan])[::step] tm.assert_series_equal(result, expected) @@ -156,10 +158,10 @@ def test_series_nullable_int(any_signed_int_ea_dtype): ), ], ) -def test_dataframe_dtypes(method, expected_data, dtypes, min_periods): +def test_dataframe_dtypes(method, expected_data, dtypes, min_periods, step): df = DataFrame(np.arange(10).reshape((5, 2)), dtype=get_dtype(dtypes)) - rolled = df.rolling(2, min_periods=min_periods) + rolled = df.rolling(2, min_periods=min_periods, step=step) if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count": msg = "No numeric types to aggregate" @@ -167,5 +169,5 @@ def test_dataframe_dtypes(method, expected_data, dtypes, min_periods): getattr(rolled, method)() else: result = getattr(rolled, method)() - expected = DataFrame(expected_data, dtype="float64") + expected = DataFrame(expected_data, dtype="float64")[::step] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 2c9ae3d70f218..6a8ac60253654 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -57,7 +57,7 @@ def 
arithmetic_numba_supported_operators(request): # Filter warnings when parallel=True and the function can't be parallelized by Numba class TestEngine: @pytest.mark.parametrize("jit", [True, False]) - def test_numba_vs_cython_apply(self, jit, nogil, parallel, nopython, center): + def test_numba_vs_cython_apply(self, jit, nogil, parallel, nopython, center, step): def f(x, *args): arg_sum = 0 for arg in args: @@ -73,10 +73,10 @@ def f(x, *args): args = (2,) s = Series(range(10)) - result = s.rolling(2, center=center).apply( + result = s.rolling(2, center=center, step=step).apply( f, args=args, engine="numba", engine_kwargs=engine_kwargs, raw=True ) - expected = s.rolling(2, center=center).apply( + expected = s.rolling(2, center=center, step=step).apply( f, engine="cython", args=args, raw=True ) tm.assert_series_equal(result, expected) @@ -85,14 +85,20 @@ def f(x, *args): "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")] ) def test_numba_vs_cython_rolling_methods( - self, data, nogil, parallel, nopython, arithmetic_numba_supported_operators + self, + data, + nogil, + parallel, + nopython, + arithmetic_numba_supported_operators, + step, ): method, kwargs = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - roll = data.rolling(2) + roll = data.rolling(2, step=step) result = getattr(roll, method)( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) @@ -135,7 +141,7 @@ def test_numba_vs_cython_expanding_methods( tm.assert_equal(result, expected) @pytest.mark.parametrize("jit", [True, False]) - def test_cache_apply(self, jit, nogil, parallel, nopython): + def test_cache_apply(self, jit, nogil, parallel, nopython, step): # Test that the functions are cached correctly if we switch functions def func_1(x): return np.mean(x) + 4 @@ -151,7 +157,7 @@ def func_2(x): engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - roll = Series(range(10)).rolling(2) + roll = 
Series(range(10)).rolling(2, step=step) result = roll.apply( func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True ) @@ -323,21 +329,29 @@ def f(x): ) def test_table_method_rolling_methods( - self, axis, nogil, parallel, nopython, arithmetic_numba_supported_operators + self, + axis, + nogil, + parallel, + nopython, + arithmetic_numba_supported_operators, + step, ): method, kwargs = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) - roll_table = df.rolling(2, method="table", axis=axis, min_periods=0) + roll_table = df.rolling(2, method="table", axis=axis, min_periods=0, step=step) if method in ("var", "std"): with pytest.raises(NotImplementedError, match=f"{method} not supported"): getattr(roll_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) else: - roll_single = df.rolling(2, method="single", axis=axis, min_periods=0) + roll_single = df.rolling( + 2, method="single", axis=axis, min_periods=0, step=step + ) result = getattr(roll_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) @@ -346,29 +360,29 @@ def test_table_method_rolling_methods( ) tm.assert_frame_equal(result, expected) - def test_table_method_rolling_apply(self, axis, nogil, parallel, nopython): + def test_table_method_rolling_apply(self, axis, nogil, parallel, nopython, step): engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} def f(x): return np.sum(x, axis=0) + 1 df = DataFrame(np.eye(3)) - result = df.rolling(2, method="table", axis=axis, min_periods=0).apply( - f, raw=True, engine_kwargs=engine_kwargs, engine="numba" - ) - expected = df.rolling(2, method="single", axis=axis, min_periods=0).apply( - f, raw=True, engine_kwargs=engine_kwargs, engine="numba" - ) + result = df.rolling( + 2, method="table", axis=axis, min_periods=0, step=step + ).apply(f, raw=True, engine_kwargs=engine_kwargs, engine="numba") + expected = df.rolling( + 2, 
method="single", axis=axis, min_periods=0, step=step + ).apply(f, raw=True, engine_kwargs=engine_kwargs, engine="numba") tm.assert_frame_equal(result, expected) - def test_table_method_rolling_weighted_mean(self): + def test_table_method_rolling_weighted_mean(self, step): def weighted_mean(x): arr = np.ones((1, x.shape[1])) arr[:, :2] = (x[:, :2] * x[:, 2]).sum(axis=0) / x[:, 2].sum() return arr df = DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) - result = df.rolling(2, method="table", min_periods=0).apply( + result = df.rolling(2, method="table", min_periods=0, step=step).apply( weighted_mean, raw=True, engine="numba" ) expected = DataFrame( @@ -378,7 +392,7 @@ def weighted_mean(x): [3.333333, 2.333333, 1.0], [1.555556, 7, 1.0], ] - ) + )[::step] tm.assert_frame_equal(result, expected) def test_table_method_expanding_apply(self, axis, nogil, parallel, nopython): diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 4f68538f6905e..4e91a133ffca2 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -82,7 +82,7 @@ def test_invalid_constructor(frame_or_series, w): @pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3)]) -def test_constructor_with_timedelta_window(window): +def test_constructor_with_timedelta_window(window, step): # GH 15440 n = 10 df = DataFrame( @@ -91,18 +91,18 @@ def test_constructor_with_timedelta_window(window): ) expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) - result = df.rolling(window=window).sum() + result = df.rolling(window=window, step=step).sum() expected = DataFrame( {"value": expected_data}, index=date_range("2015-12-24", periods=n, freq="D"), - ) + )[::step] tm.assert_frame_equal(result, expected) - expected = df.rolling("3D").sum() + expected = df.rolling("3D", step=step).sum() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3), "3D"]) -def 
test_constructor_timedelta_window_and_minperiods(window, raw): +def test_constructor_timedelta_window_and_minperiods(window, step, raw): # GH 15305 n = 10 df = DataFrame( @@ -112,9 +112,11 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): expected = DataFrame( {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, index=date_range("2017-08-08", periods=n, freq="D"), + )[::step] + result_roll_sum = df.rolling(window=window, min_periods=2, step=step).sum() + result_roll_generic = df.rolling(window=window, min_periods=2, step=step).apply( + sum, raw=raw ) - result_roll_sum = df.rolling(window=window, min_periods=2).sum() - result_roll_generic = df.rolling(window=window, min_periods=2).apply(sum, raw=raw) tm.assert_frame_equal(result_roll_sum, expected) tm.assert_frame_equal(result_roll_generic, expected) @@ -133,18 +135,20 @@ def test_numpy_compat(method): @pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) -def test_closed_fixed(closed, arithmetic_win_operators): +def test_closed_fixed(closed, arithmetic_win_operators, step): # GH 34315 func_name = arithmetic_win_operators df_fixed = DataFrame({"A": [0, 1, 2, 3, 4]}) df_time = DataFrame({"A": [0, 1, 2, 3, 4]}, index=date_range("2020", periods=5)) result = getattr( - df_fixed.rolling(2, closed=closed, min_periods=1), + df_fixed.rolling(2, closed=closed, min_periods=1, step=step), func_name, )() + if step is not None: + result = result.reset_index(drop=True) expected = getattr( - df_time.rolling("2D", closed=closed, min_periods=1), + df_time.rolling("2D", closed=closed, min_periods=1, step=step), func_name, )().reset_index(drop=True) @@ -197,7 +201,7 @@ def test_closed_fixed(closed, arithmetic_win_operators): ], ) def test_datetimelike_centered_selections( - closed, window_selections, arithmetic_win_operators + closed, window_selections, step, arithmetic_win_operators ): # GH 34315 func_name = arithmetic_win_operators @@ -208,7 +212,7 @@ def 
test_datetimelike_centered_selections( expected = DataFrame( {"A": [getattr(df_time["A"].iloc[s], func_name)() for s in window_selections]}, index=date_range("2020", periods=5), - ) + )[::step] if func_name == "sem": kwargs = {"ddof": 0} @@ -216,7 +220,7 @@ def test_datetimelike_centered_selections( kwargs = {} result = getattr( - df_time.rolling("2D", closed=closed, min_periods=1, center=True), + df_time.rolling("2D", closed=closed, min_periods=1, center=True, step=step), func_name, )(**kwargs) @@ -237,7 +241,7 @@ def test_datetimelike_centered_selections( ], ) def test_datetimelike_centered_offset_covers_all( - window, closed, expected, frame_or_series + window, closed, expected, step, frame_or_series ): # GH 42753 @@ -248,8 +252,8 @@ def test_datetimelike_centered_offset_covers_all( ] df = frame_or_series([1, 1, 1], index=index) - result = df.rolling(window, closed=closed, center=True).sum() - expected = frame_or_series(expected, index=index) + result = df.rolling(window, closed=closed, center=True, step=step).sum() + expected = frame_or_series(expected, index=index)[::step] tm.assert_equal(result, expected) @@ -263,7 +267,7 @@ def test_datetimelike_centered_offset_covers_all( ], ) def test_datetimelike_nonunique_index_centering( - window, closed, expected, frame_or_series + window, closed, expected, frame_or_series, step ): index = DatetimeIndex( [ @@ -279,28 +283,28 @@ def test_datetimelike_nonunique_index_centering( ) df = frame_or_series([1] * 8, index=index, dtype=float) - expected = frame_or_series(expected, index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float)[::step] - result = df.rolling(window, center=True, closed=closed).sum() + result = df.rolling(window, center=True, closed=closed, step=step).sum() tm.assert_equal(result, expected) -def test_even_number_window_alignment(): +def test_even_number_window_alignment(step): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", 
periods=3)) # behavior of index- and datetime-based windows differs here! # s.rolling(window=2, min_periods=1, center=True).mean() - result = s.rolling(window="2D", min_periods=1, center=True).mean() + result = s.rolling(window="2D", min_periods=1, center=True, step=step).mean() - expected = Series([0.5, 1.5, 2], index=s.index) + expected = Series([0.5, 1.5, 2], index=s.index)[::step] tm.assert_series_equal(result, expected) -def test_closed_fixed_binary_col(center): +def test_closed_fixed_binary_col(center, step): # GH 34315 data = [0, 1, 1, 0, 0, 1, 0, 1] df = DataFrame( @@ -317,31 +321,35 @@ def test_closed_fixed_binary_col(center): expected_data, columns=["binary_col"], index=date_range(start="2020-01-01", freq="min", periods=len(expected_data)), - ) + )[::step] - rolling = df.rolling(window=len(df), closed="left", min_periods=1, center=center) + rolling = df.rolling( + window=len(df), closed="left", min_periods=1, center=center, step=step + ) result = rolling.mean() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("closed", ["neither", "left"]) -def test_closed_empty(closed, arithmetic_win_operators): +def test_closed_empty(closed, arithmetic_win_operators, step): # GH 26005 func_name = arithmetic_win_operators ser = Series(data=np.arange(5), index=date_range("2000", periods=5, freq="2D")) - roll = ser.rolling("1D", closed=closed) + roll = ser.rolling("1D", closed=closed, step=step) result = getattr(roll, func_name)() - expected = Series([np.nan] * 5, index=ser.index) + expected = Series([np.nan] * 5, index=ser.index)[::step] tm.assert_series_equal(result, expected) @pytest.mark.parametrize("func", ["min", "max"]) -def test_closed_one_entry(func): +def test_closed_one_entry(func, step): # GH24718 ser = Series(data=[2], index=date_range("2000", periods=1)) - result = getattr(ser.rolling("10D", closed="left"), func)() - tm.assert_series_equal(result, Series([np.nan], index=ser.index)) + result = getattr(ser.rolling("10D", closed="left", 
step=step), func)() + index = ser.index.copy() + index.freq = index.freq * (step or 1) + tm.assert_series_equal(result, Series([np.nan], index=index)) @pytest.mark.parametrize("func", ["min", "max"]) diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index 842c056806092..9ab4ff13796d6 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -38,10 +38,11 @@ [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}], ], ) -def test_series(series, compare_func, roll_func, kwargs): - result = getattr(series.rolling(50), roll_func)(**kwargs) +def test_series(series, compare_func, roll_func, kwargs, step): + result = getattr(series.rolling(50, step=step), roll_func)(**kwargs) assert isinstance(result, Series) - tm.assert_almost_equal(result.iloc[-1], compare_func(series[-50:])) + end = range(0, len(series), step or 1)[-1] + 1 + tm.assert_almost_equal(result.iloc[-1], compare_func(series[end - 50 : end])) @pytest.mark.parametrize( @@ -64,12 +65,13 @@ def test_series(series, compare_func, roll_func, kwargs): [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}], ], ) -def test_frame(raw, frame, compare_func, roll_func, kwargs): - result = getattr(frame.rolling(50), roll_func)(**kwargs) +def test_frame(raw, frame, compare_func, roll_func, kwargs, step): + result = getattr(frame.rolling(50, step=step), roll_func)(**kwargs) assert isinstance(result, DataFrame) + end = range(0, len(frame), step or 1)[-1] + 1 tm.assert_series_equal( result.iloc[-1, :], - frame.iloc[-50:, :].apply(compare_func, axis=0, raw=raw), + frame.iloc[end - 50 : end, :].apply(compare_func, axis=0, raw=raw), check_names=False, ) @@ -200,13 +202,13 @@ def test_nans_count(): ], ) @pytest.mark.parametrize("minp", [0, 99, 100]) -def test_min_periods(series, minp, roll_func, kwargs): - result = getattr(series.rolling(len(series) + 1, min_periods=minp), roll_func)( - **kwargs - ) - expected = 
getattr(series.rolling(len(series), min_periods=minp), roll_func)( - **kwargs - ) +def test_min_periods(series, minp, roll_func, kwargs, step): + result = getattr( + series.rolling(len(series) + 1, min_periods=minp, step=step), roll_func + )(**kwargs) + expected = getattr( + series.rolling(len(series), min_periods=minp, step=step), roll_func + )(**kwargs) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) @@ -214,9 +216,9 @@ def test_min_periods(series, minp, roll_func, kwargs): tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) -def test_min_periods_count(series): - result = series.rolling(len(series) + 1, min_periods=0).count() - expected = series.rolling(len(series), min_periods=0).count() +def test_min_periods_count(series, step): + result = series.rolling(len(series) + 1, min_periods=0, step=step).count() + expected = series.rolling(len(series), min_periods=0, step=step).count() nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) @@ -364,7 +366,7 @@ def test_rolling_functions_window_non_shrinkage(f): tm.assert_frame_equal(df_result, df_expected) -def test_rolling_max_gh6297(): +def test_rolling_max_gh6297(step): """Replicate result expected in GH #6297""" indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 2 datapoints on one of the days @@ -378,12 +380,12 @@ def test_rolling_max_gh6297(): expected = Series( [1.0, 2.0, 6.0, 4.0, 5.0], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), - ) - x = series.resample("D").max().rolling(window=1).max() + )[::step] + x = series.resample("D").max().rolling(window=1, step=step).max() tm.assert_series_equal(expected, x) -def test_rolling_max_resample(): +def test_rolling_max_resample(step): indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 3 datapoints on last day (4, 10, and 20) @@ -399,16 +401,16 @@ def test_rolling_max_resample(): expected = Series( [0.0, 1.0, 2.0, 3.0, 20.0], 
index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), - ) - x = series.resample("D").max().rolling(window=1).max() + )[::step] + x = series.resample("D").max().rolling(window=1, step=step).max() tm.assert_series_equal(expected, x) # Now specify median (10.0) expected = Series( [0.0, 1.0, 2.0, 3.0, 10.0], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), - ) - x = series.resample("D").median().rolling(window=1).max() + )[::step] + x = series.resample("D").median().rolling(window=1, step=step).max() tm.assert_series_equal(expected, x) # Now specify mean (4+10+20)/3 @@ -416,12 +418,12 @@ def test_rolling_max_resample(): expected = Series( [0.0, 1.0, 2.0, 3.0, v], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), - ) - x = series.resample("D").mean().rolling(window=1).max() + )[::step] + x = series.resample("D").mean().rolling(window=1, step=step).max() tm.assert_series_equal(expected, x) -def test_rolling_min_resample(): +def test_rolling_min_resample(step): indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 3 datapoints on last day (4, 10, and 20) @@ -437,8 +439,8 @@ def test_rolling_min_resample(): expected = Series( [0.0, 1.0, 2.0, 3.0, 4.0], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), - ) - r = series.resample("D").min().rolling(window=1) + )[::step] + r = series.resample("D").min().rolling(window=1, step=step) tm.assert_series_equal(expected, r.min()) diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py index 56b79097a1d05..815ee419590f7 100644 --- a/pandas/tests/window/test_rolling_quantile.py +++ b/pandas/tests/window/test_rolling_quantile.py @@ -34,21 +34,23 @@ def scoreatpercentile(a, per): @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) -def test_series(series, q): +def test_series(series, q, step): compare_func = partial(scoreatpercentile, per=q) - result = 
series.rolling(50).quantile(q) + result = series.rolling(50, step=step).quantile(q) assert isinstance(result, Series) - tm.assert_almost_equal(result.iloc[-1], compare_func(series[-50:])) + end = range(0, len(series), step or 1)[-1] + 1 + tm.assert_almost_equal(result.iloc[-1], compare_func(series[end - 50 : end])) @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) -def test_frame(raw, frame, q): +def test_frame(raw, frame, q, step): compare_func = partial(scoreatpercentile, per=q) - result = frame.rolling(50).quantile(q) + result = frame.rolling(50, step=step).quantile(q) assert isinstance(result, DataFrame) + end = range(0, len(frame), step or 1)[-1] + 1 tm.assert_series_equal( result.iloc[-1, :], - frame.iloc[-50:, :].apply(compare_func, axis=0, raw=raw), + frame.iloc[end - 50 : end, :].apply(compare_func, axis=0, raw=raw), check_names=False, ) @@ -113,9 +115,9 @@ def test_nans(q): @pytest.mark.parametrize("minp", [0, 99, 100]) @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) -def test_min_periods(series, minp, q): - result = series.rolling(len(series) + 1, min_periods=minp).quantile(q) - expected = series.rolling(len(series), min_periods=minp).quantile(q) +def test_min_periods(series, minp, q, step): + result = series.rolling(len(series) + 1, min_periods=minp, step=step).quantile(q) + expected = series.rolling(len(series), min_periods=minp, step=step).quantile(q) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index 46b7eb6cbc285..152172d7b2266 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -112,9 +112,13 @@ def test_nans(sp_func, roll_func): @pytest.mark.parametrize("minp", [0, 99, 100]) @pytest.mark.parametrize("roll_func", ["kurt", "skew"]) -def test_min_periods(series, minp, roll_func): - result = getattr(series.rolling(len(series) + 1, 
min_periods=minp), roll_func)() - expected = getattr(series.rolling(len(series), min_periods=minp), roll_func)() +def test_min_periods(series, minp, roll_func, step): + result = getattr( + series.rolling(len(series) + 1, min_periods=minp, step=step), roll_func + )() + expected = getattr( + series.rolling(len(series), min_periods=minp, step=step), roll_func + )() nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) @@ -172,55 +176,55 @@ def test_center_reindex_frame(frame, roll_func): tm.assert_frame_equal(frame_xp, frame_rs) -def test_rolling_skew_edge_cases(): +def test_rolling_skew_edge_cases(step): - all_nan = Series([np.NaN] * 5) + all_nan = Series([np.NaN] * 5)[::step] # yields all NaN (0 variance) d = Series([1] * 5) - x = d.rolling(window=5).skew() + x = d.rolling(window=5, step=step).skew() tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) - x = d.rolling(window=2).skew() + x = d.rolling(window=2, step=step).skew() tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 0.177994, 1.548824] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) - x = d.rolling(window=4).skew() + expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824])[::step] + x = d.rolling(window=4, step=step).skew() tm.assert_series_equal(expected, x) -def test_rolling_kurt_edge_cases(): +def test_rolling_kurt_edge_cases(step): - all_nan = Series([np.NaN] * 5) + all_nan = Series([np.NaN] * 5)[::step] # yields all NaN (0 variance) d = Series([1] * 5) - x = d.rolling(window=5).kurt() + x = d.rolling(window=5, step=step).kurt() tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) - x = d.rolling(window=3).kurt() + x = d.rolling(window=3, step=step).kurt() tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] d = Series([-1.50837035, -0.1297039, 
0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) - x = d.rolling(window=4).kurt() + expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499])[::step] + x = d.rolling(window=4, step=step).kurt() tm.assert_series_equal(expected, x) -def test_rolling_skew_eq_value_fperr(): +def test_rolling_skew_eq_value_fperr(step): # #18804 all rolling skew for all equal values should return Nan - a = Series([1.1] * 15).rolling(window=10).skew() + a = Series([1.1] * 15).rolling(window=10, step=step).skew() assert np.isnan(a).all() -def test_rolling_kurt_eq_value_fperr(): +def test_rolling_kurt_eq_value_fperr(step): # #18804 all rolling kurt for all equal values should return Nan - a = Series([1.1] * 15).rolling(window=10).kurt() + a = Series([1.1] * 15).rolling(window=10, step=step).kurt() assert np.isnan(a).all() diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index adb3fffbd5750..c356c9bdc7742 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -125,10 +125,10 @@ def test_constructor_with_win_type_invalid(frame_or_series): @td.skip_if_no_scipy @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") -def test_window_with_args(): +def test_window_with_args(step): # make sure that we are aggregating window functions correctly with arg r = Series(np.random.randn(100)).rolling( - window=10, min_periods=1, win_type="gaussian" + window=10, min_periods=1, win_type="gaussian", step=step ) expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) expected.columns = ["", ""] @@ -183,10 +183,10 @@ def get_window_bounds(self, num_values, min_periods, center, closed, step): @td.skip_if_no_scipy -def test_cmov_mean(): +def test_cmov_mean(step): # GH 8238 vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) - result = Series(vals).rolling(5, center=True).mean() + result = Series(vals).rolling(5, center=True, 
step=step).mean() expected_values = [ np.nan, np.nan, @@ -199,15 +199,15 @@ def test_cmov_mean(): np.nan, np.nan, ] - expected = Series(expected_values) + expected = Series(expected_values)[::step] tm.assert_series_equal(expected, result) @td.skip_if_no_scipy -def test_cmov_window(): +def test_cmov_window(step): # GH 8238 vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) - result = Series(vals).rolling(5, win_type="boxcar", center=True).mean() + result = Series(vals).rolling(5, win_type="boxcar", center=True, step=step).mean() expected_values = [ np.nan, np.nan, @@ -220,28 +220,28 @@ def test_cmov_window(): np.nan, np.nan, ] - expected = Series(expected_values) + expected = Series(expected_values)[::step] tm.assert_series_equal(expected, result) @td.skip_if_no_scipy -def test_cmov_window_corner(): +def test_cmov_window_corner(step): # GH 8238 # all nan vals = Series([np.nan] * 10) - result = vals.rolling(5, center=True, win_type="boxcar").mean() + result = vals.rolling(5, center=True, win_type="boxcar", step=step).mean() assert np.isnan(result).all() # empty vals = Series([], dtype=object) - result = vals.rolling(5, center=True, win_type="boxcar").mean() + result = vals.rolling(5, center=True, win_type="boxcar", step=step).mean() assert len(result) == 0 # shorter than window vals = Series(np.random.randn(5)) - result = vals.rolling(10, win_type="boxcar").mean() + result = vals.rolling(10, win_type="boxcar", step=step).mean() assert np.isnan(result).all() - assert len(result) == 5 + assert len(result) == len(range(0, 5, step or 1)) @td.skip_if_no_scipy @@ -310,7 +310,7 @@ def test_cmov_window_corner(): ), ], ) -def test_cmov_window_frame(f, xp): +def test_cmov_window_frame(f, xp, step): # Gh 8238 df = DataFrame( np.array( @@ -328,28 +328,30 @@ def test_cmov_window_frame(f, xp): ] ) ) - xp = DataFrame(np.array(xp)) + xp = DataFrame(np.array(xp))[::step] - roll = df.rolling(5, win_type="boxcar", center=True) + roll = df.rolling(5, 
win_type="boxcar", center=True, step=step) rs = getattr(roll, f)() tm.assert_frame_equal(xp, rs) @td.skip_if_no_scipy -def test_cmov_window_na_min_periods(): +def test_cmov_window_na_min_periods(step): # min_periods vals = Series(np.random.randn(10)) vals[4] = np.nan vals[8] = np.nan - xp = vals.rolling(5, min_periods=4, center=True).mean() - rs = vals.rolling(5, win_type="boxcar", min_periods=4, center=True).mean() + xp = vals.rolling(5, min_periods=4, center=True, step=step).mean() + rs = vals.rolling( + 5, win_type="boxcar", min_periods=4, center=True, step=step + ).mean() tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy -def test_cmov_window_regular(win_types): +def test_cmov_window_regular(win_types, step): # GH 8238 vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) xps = { @@ -451,26 +453,26 @@ def test_cmov_window_regular(win_types): ], } - xp = Series(xps[win_types]) - rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() + xp = Series(xps[win_types])[::step] + rs = Series(vals).rolling(5, win_type=win_types, center=True, step=step).mean() tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy -def test_cmov_window_regular_linear_range(win_types): +def test_cmov_window_regular_linear_range(win_types, step): # GH 8238 vals = np.array(range(10), dtype=float) xp = vals.copy() xp[:2] = np.nan xp[-2:] = np.nan - xp = Series(xp) + xp = Series(xp)[::step] - rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() + rs = Series(vals).rolling(5, win_type=win_types, center=True, step=step).mean() tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy -def test_cmov_window_regular_missing_data(win_types): +def test_cmov_window_regular_missing_data(win_types, step): # GH 8238 vals = np.array( [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48] @@ -574,13 +576,13 @@ def test_cmov_window_regular_missing_data(win_types): ], } - xp = Series(xps[win_types]) - rs = Series(vals).rolling(5, 
win_type=win_types, min_periods=3).mean() + xp = Series(xps[win_types])[::step] + rs = Series(vals).rolling(5, win_type=win_types, min_periods=3, step=step).mean() tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy -def test_cmov_window_special(win_types_special): +def test_cmov_window_special(win_types_special, step): # GH 8238 kwds = { "kaiser": {"beta": 1.0}, @@ -642,17 +644,17 @@ def test_cmov_window_special(win_types_special): ], } - xp = Series(xps[win_types_special]) + xp = Series(xps[win_types_special])[::step] rs = ( Series(vals) - .rolling(5, win_type=win_types_special, center=True) + .rolling(5, win_type=win_types_special, center=True, step=step) .mean(**kwds[win_types_special]) ) tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy -def test_cmov_window_special_linear_range(win_types_special): +def test_cmov_window_special_linear_range(win_types_special, step): # GH 8238 kwds = { "kaiser": {"beta": 1.0}, @@ -666,11 +668,11 @@ def test_cmov_window_special_linear_range(win_types_special): xp = vals.copy() xp[:2] = np.nan xp[-2:] = np.nan - xp = Series(xp) + xp = Series(xp)[::step] rs = ( Series(vals) - .rolling(5, win_type=win_types_special, center=True) + .rolling(5, win_type=win_types_special, center=True, step=step) .mean(**kwds[win_types_special]) ) tm.assert_series_equal(xp, rs) From a1d6954f36a92b5ba37c08818c764c8160188060 Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Wed, 16 Feb 2022 10:55:42 -0500 Subject: [PATCH 4/7] Request fixes and step-not-implemented testing --- pandas/_libs/window/indexers.pyx | 4 +- pandas/core/indexers/objects.py | 46 +++++++++++++--------- pandas/core/window/rolling.py | 50 ++++++++++++++++++------ pandas/tests/window/conftest.py | 19 --------- pandas/tests/window/test_apply.py | 5 +++ pandas/tests/window/test_base_indexer.py | 3 ++ pandas/tests/window/test_rolling.py | 10 +++++ pandas/util/_test_decorators.py | 40 +++++++++++++++++++ 8 files changed, 124 insertions(+), 53 deletions(-) diff --git 
a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 736c15d900185..d04d2fb19439a 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -47,12 +47,12 @@ def calculate_variable_window_bounds( Returns ------- - (ndarray[int64], ndarray[int64], ndarray[int64]) + (ndarray[int64], ndarray[int64]) """ cdef: bint left_closed = False bint right_closed = False - ndarray[int64_t, ndim=1] start, end, ref + ndarray[int64_t, ndim=1] start, end int64_t start_bound, end_bound, index_growth_sign = 1 Py_ssize_t i, j diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index d1a7e200bf307..3fdecae1c10c0 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -34,10 +34,8 @@ Returns ------- -A tuple of ndarray[int64]s: -start : array of start boundaries -end : array of end boundaries -ref : array of window reference locations, or None indicating all if step is None or 1 +A tuple of ndarray[int64]s, indicating the boundaries of each +window """ @@ -113,9 +111,12 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: int | None = None, + step: None = None, ) -> tuple[np.ndarray, np.ndarray]: + if step is not None: + raise NotImplementedError("step not implemented for variable window") + # error: Argument 4 to "calculate_variable_window_bounds" has incompatible # type "Optional[bool]"; expected "bool" # error: Argument 6 to "calculate_variable_window_bounds" has incompatible @@ -126,7 +127,7 @@ def get_window_bounds( min_periods, center, # type: ignore[arg-type] closed, - step if step is not None else 1, + 1, self.index_array, # type: ignore[arg-type] ) @@ -153,9 +154,12 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: int | None = None, + step: None = None, ) -> tuple[np.ndarray, np.ndarray]: + if step is not None: + raise 
NotImplementedError("step not implemented for variable offset window") + # if windows is variable, default is 'right', otherwise default is 'both' if closed is None: closed = "right" if self.index is not None else "both" @@ -211,7 +215,7 @@ def get_window_bounds( if not right_closed: end[i] -= 1 - return start[::step], end[::step] + return start, end class ExpandingIndexer(BaseIndexer): @@ -224,14 +228,15 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: int | None = None, + step: None = None, ) -> tuple[np.ndarray, np.ndarray]: - if step is None: - step = 1 - end = np.arange(1, num_values + 1, step, dtype=np.int64) + if step is not None: + raise NotImplementedError("step not implemented for expanding window") + + end = np.arange(1, num_values + 1, dtype=np.int64) start = np.zeros(len(end), dtype=np.int64) - return start[::step], end[::step] + return start, end class FixedForwardWindowIndexer(BaseIndexer): @@ -333,10 +338,11 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: int | None = None, + step: None = None, ) -> tuple[np.ndarray, np.ndarray]: - if step not in [None, 1]: - raise NotImplementedError(f"unsupported step: {step}") + if step is not None: + raise NotImplementedError("step not implemented for groupby window") + # 1) For each group, get the indices that belong to the group # 2) Use the indices to calculate the start & end bounds of the window # 3) Append the window bounds in group order @@ -390,11 +396,13 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: int | None = None, + step: None = None, ) -> tuple[np.ndarray, np.ndarray]: - if step not in [None, 1]: - raise NotImplementedError(f"unsupported step: {step}") + if step is not None: + raise NotImplementedError( + "step not implemented for exponential moving window" + ) return (
np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index fdca3f88e9128..e5ade391db552 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -243,9 +243,19 @@ def _check_window_bounds( raise ValueError( f"start and end bounds ({len(start)}) must be the same length " f"as the object ({num_vals}) divided by the step ({self.step}) " - f"if given and rounded up unless groupby was used" + f"if given and rounded up" ) + def _slice_index(self, index: Index, result: int | None = None) -> Index: + """ + Slices the index for a given result. + """ + return ( + index + if result is None or len(result) == len(index) + else index[:: self.step] + ) + def _create_data(self, obj: NDFrameT) -> NDFrameT: """ Split data into blocks & return conformed data. @@ -435,7 +445,7 @@ def _apply_series( raise DataError("No numeric types to aggregate") from err result = homogeneous_func(values) - index = obj.index if len(result) == len(obj.index) else obj.index[:: self.step] + index = self._slice_index(obj.index, result) return obj._constructor(result, index=index, name=obj.name) def _apply_blockwise( @@ -467,16 +477,17 @@ def hfunc(values: ArrayLike) -> ArrayLike: # GH#42736 operate column-wise instead of block-wise try: res = hfunc(arr) - except (TypeError, NotImplementedError): + except TypeError: pass + except NotImplementedError as err: + if "step not implemented" in str(err): + raise else: res_values.append(res) taker.append(i) - index = ( - obj.index - if len(res_values) == 0 or len(res_values[0]) == len(obj.index) - else obj.index[:: self.step] + index = self._slice_index( + obj.index, res_values[0] if len(res_values) > 0 else None ) df = type(obj)._from_arrays( res_values, @@ -515,7 +526,7 @@ def _apply_tablewise( values = values.T if self.axis == 1 else values result = homogeneous_func(values) result = result.T if self.axis == 1 else result - index = obj.index if 
len(result) == len(obj.index) else obj.index[:: self.step] + index = self._slice_index(obj.index, result) columns = ( obj.columns if result.shape[1] == len(obj.columns) @@ -643,7 +654,7 @@ def _numba_apply( result = aggregator(values, start, end, min_periods, *func_args) NUMBA_FUNC_CACHE[(func, numba_cache_key_str)] = aggregator result = result.T if self.axis == 1 else result - index = obj.index if len(result) == len(obj.index) else obj.index[:: self.step] + index = self._slice_index(obj.index, result) if obj.ndim == 1: result = result.squeeze() out = obj._constructor(result, index=index, name=obj.name) @@ -1606,8 +1617,10 @@ def cov( ddof: int = 1, **kwargs, ): - if self.step not in [None, 1]: - raise NotImplementedError(f"invalid step: {self.step}") + if self.step is not None: + raise NotImplementedError( + "step not implemented for rolling and expanding cov" + ) from pandas import Series @@ -1651,8 +1664,10 @@ def corr( **kwargs, ): - if self.step not in [None, 1]: - raise NotImplementedError(f"invalid step: {self.step}") + if self.step is not None: + raise NotImplementedError( + "step not implemented for rolling and expanding corr" + ) from pandas import Series @@ -2631,6 +2646,15 @@ class RollingGroupby(BaseWindowGroupby, Rolling): Provide a rolling groupby implementation. 
""" + def __init__( + self, + *args, + **kwargs, + ): + if kwargs.get("step") is not None: + raise NotImplementedError("step not implemented for rolling groupby") + super().__init__(*args, **kwargs) + _attributes = Rolling._attributes + BaseWindowGroupby._attributes def _get_window_indexer(self) -> GroupbyIndexer: diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 715e4bcf6feaf..f42a1a5449c5c 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -132,22 +132,3 @@ def frame(): def step(request): """step keyword argument for rolling window operations.""" return request.param - - -@pytest.fixture -def step_methods(): - """Make a step-argument helper as fixture.""" - - class StepMethods: - @staticmethod - def get_selected_indices(step, group_keys): - """Return step-selected indices within groups.""" - step = step or 1 - group_ind = {} - for i, key in enumerate(group_keys): - group_ind.setdefault(key, []).append(i) - return sorted( - ind[j] for ind in group_ind.values() for j in range(0, len(ind), step) - ) - - return StepMethods diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index d4f5fed0a5d3c..2cee3ea1c1ec5 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -52,6 +54,7 @@ def test_rolling_apply_out_of_bounds(engine_and_raw): @pytest.mark.parametrize("window", [2, "2s"]) +@td.step_not_implemented(when=lambda bargs: bargs.get("window") == "2s") def test_rolling_apply_with_pandas_objects(window, step): # 5071 df = DataFrame( @@ -95,6 +98,7 @@ def test_rolling_apply(engine_and_raw, step): tm.assert_series_equal(result, expected) +@td.step_not_implemented() def test_all_apply(engine_and_raw, step): engine, raw = engine_and_raw @@ -112,6 +116,7 @@ def test_all_apply(engine_and_raw, step): 
tm.assert_frame_equal(result, expected) +@td.step_not_implemented() def test_ragged_apply(engine_and_raw, step): engine, raw = engine_and_raw diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 76dede02f489d..a47a43845acfb 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, MultiIndex, @@ -259,6 +261,7 @@ def test_rolling_forward_cov_corr(func, expected): ["left", [0.0, 0.0, 1.0, 2.0, 5.0, 9.0, 5.0, 6.0, 7.0, 8.0]], ], ) +@td.step_not_implemented() def test_non_fixed_variable_window_indexer(closed, expected_data, step): index = date_range("2020", periods=10) df = DataFrame(range(10), index=index) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 4e91a133ffca2..0374a1515d5f9 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -11,6 +11,7 @@ is_platform_mac, ) from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -82,6 +83,7 @@ def test_invalid_constructor(frame_or_series, w): @pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3)]) +@td.step_not_implemented() def test_constructor_with_timedelta_window(window, step): # GH 15440 n = 10 @@ -102,6 +104,7 @@ def test_constructor_with_timedelta_window(window, step): @pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3), "3D"]) +@td.step_not_implemented() def test_constructor_timedelta_window_and_minperiods(window, step, raw): # GH 15305 n = 10 @@ -135,6 +138,7 @@ def test_numpy_compat(method): @pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) +@td.step_not_implemented() def test_closed_fixed(closed, arithmetic_win_operators, step): # GH 34315 func_name = arithmetic_win_operators @@ 
-200,6 +204,7 @@ def test_closed_fixed(closed, arithmetic_win_operators, step): ), ], ) +@td.step_not_implemented() def test_datetimelike_centered_selections( closed, window_selections, step, arithmetic_win_operators ): @@ -240,6 +245,7 @@ def test_datetimelike_centered_selections( ("2s", "neither", [1.0, 2.0, 2.0]), ], ) +@td.step_not_implemented() def test_datetimelike_centered_offset_covers_all( window, closed, expected, step, frame_or_series ): @@ -266,6 +272,7 @@ def test_datetimelike_centered_offset_covers_all( ("2D", "neither", [2, 2, 2, 2, 2, 2, 2, 2]), ], ) +@td.step_not_implemented() def test_datetimelike_nonunique_index_centering( window, closed, expected, frame_or_series, step ): @@ -290,6 +297,7 @@ def test_datetimelike_nonunique_index_centering( tm.assert_equal(result, expected) +@td.step_not_implemented() def test_even_number_window_alignment(step): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) @@ -331,6 +339,7 @@ def test_closed_fixed_binary_col(center, step): @pytest.mark.parametrize("closed", ["neither", "left"]) +@td.step_not_implemented() def test_closed_empty(closed, arithmetic_win_operators, step): # GH 26005 func_name = arithmetic_win_operators @@ -343,6 +352,7 @@ def test_closed_empty(closed, arithmetic_win_operators, step): @pytest.mark.parametrize("func", ["min", "max"]) +@td.step_not_implemented() def test_closed_one_entry(func, step): # GH24718 ser = Series(data=[2], index=date_range("2000", periods=1)) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 10322a25ffd18..7ef4a7766d835 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -26,6 +26,11 @@ def test_foo(): from __future__ import annotations from contextlib import contextmanager +from functools import wraps +from inspect import ( + Parameter, + signature, +) import locale from typing import Callable import warnings @@ -312,3 +317,38 @@ def 
mark_array_manager_not_yet_implemented(request): get_option("mode.data_manager") == "array", reason="Test that relies on BlockManager internals or specific behaviour", ) + + +def step_not_implemented(when=None): + """ + Decorator factory for test cases expecting "step not implemented" errors. + + Parameters + ---------- + when : Callable + a callable accepting a BoundArguments object, which contains the arguments + passed to the test method, and returning True when a "step not implemented" + error is expected, provided the "step" argument is not None. Defaults to + always True, meaning that the error is expected only when the step argument + is not None. + """ + + def decorate_step_not_implemented(f): + expected_kinds = [Parameter.POSITIONAL_ONLY, Parameter.POSITIONAL_OR_KEYWORD] + sig = signature(f) + prms = sig.parameters + if prms.get("step") is None or prms["step"].kind not in expected_kinds: + raise ValueError(f"missing positional step parameter in {f}") + + @wraps(f) + def wrap_step_not_implemented(*args, **kwargs): + bargs = sig.bind(*args, **kwargs).arguments + if bargs.get("step") is not None and (when is None or when(bargs)): + with pytest.raises(NotImplementedError, match="step not implemented"): + return f(*args, **kwargs) + else: + return f(*args, **kwargs) + + return wrap_step_not_implemented + + return decorate_step_not_implemented From 687e8a410daafa28e422ae795eff462ea71cc090 Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Wed, 16 Feb 2022 14:51:21 -0500 Subject: [PATCH 5/7] Fix typing --- pandas/core/indexers/objects.py | 10 +++++----- pandas/core/window/rolling.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 3fdecae1c10c0..5bcc0baaa2ac7 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -111,7 +111,7 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - 
step: None = None, + step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: if step is not None: @@ -154,7 +154,7 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: None = None, + step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: if step is not None: @@ -228,7 +228,7 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: None = None, + step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: if step is not None: @@ -338,7 +338,7 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: None = None, + step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: if step is not None: raise NotImplementedError("step not implemented for groupby window") @@ -396,7 +396,7 @@ def get_window_bounds( min_periods: int | None = None, center: bool | None = None, closed: str | None = None, - step: None = None, + step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: if step is not None: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e5ade391db552..079cbd99fc219 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,6 +14,7 @@ Any, Callable, Hashable, + Sized, ) import warnings @@ -246,7 +247,7 @@ def _check_window_bounds( f"if given and rounded up" ) - def _slice_index(self, index: Index, result: int | None = None) -> Index: + def _slice_index(self, index: Index, result: Sized | None = None) -> Index: """ Slices the index for a given result. 
""" From 527e0e0c2bf13d7a1eb8d016450073a50c937c21 Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Fri, 18 Feb 2022 05:55:37 -0500 Subject: [PATCH 6/7] More requested fixes --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/window/indexers.pyx | 3 ++ pandas/core/indexers/objects.py | 5 ++ pandas/core/window/common.py | 11 ++--- pandas/core/window/rolling.py | 61 ++++++++++++++++-------- pandas/tests/window/test_apply.py | 8 ++-- pandas/tests/window/test_base_indexer.py | 4 +- pandas/tests/window/test_rolling.py | 58 ++++++++++++++++++---- pandas/util/_test_decorators.py | 40 ---------------- 9 files changed, 105 insertions(+), 86 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 919ed926f8195..010788eda1a1e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -37,6 +37,7 @@ Other enhancements - :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`) - :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`) - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) +- :meth:`DataFrame.rolling` and :meth:`Series.rolling` now support a ``step`` parameter with fixed-length windows (:issue:`15354`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index d04d2fb19439a..992212a872035 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -56,6 +56,9 @@ def calculate_variable_window_bounds( int64_t start_bound, end_bound, index_growth_sign = 1 Py_ssize_t i, j + if num_values <= 0: + return np.empty(0, dtype='int64'), np.empty(0, dtype='int64') + # default is 'right' if closed is None: closed = 'right' diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 5bcc0baaa2ac7..54bdee0bb0208 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -29,6 +29,7 @@ closed passed from the top level rolling API step : int, default None step passed from the top level rolling API + .. versionadded:: 1.5 win_type : str, default None win_type passed from the top level rolling API @@ -159,6 +160,8 @@ def get_window_bounds( if step is not None: raise NotImplementedError("step not implemented for variable offset window") + if num_values <= 0: + return np.empty(0, dtype="int64"), np.empty(0, dtype="int64") # if windows is variable, default is 'right', otherwise default is 'both' if closed is None: @@ -381,6 +384,8 @@ def get_window_bounds( ) start_arrays.append(window_indices.take(ensure_platform_int(start))) end_arrays.append(window_indices.take(ensure_platform_int(end))) + if len(start_arrays) == 0: + return np.array([], dtype=np.int64), np.array([], dtype=np.int64) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) return start, end diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 7035a67acaea2..15144116fa924 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -22,9 +22,7 @@ def flex_binary_moment(arg1, arg2, f, pairwise=False): from pandas import DataFrame def dataframe_from_int_dict(data, frame_template): - 
result = DataFrame( - data, index=None if len(data) > 0 else frame_template.index - ) + result = DataFrame(data, index=frame_template.index) if len(result.columns) > 0: result.columns = frame_template.columns[result.columns] return result @@ -44,16 +42,13 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'arg2' columns are not unique") X, Y = arg1.align(arg2, join="outer") X, Y = prep_binary(X, Y) - result_index = X.index res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) - result_index = results[col].index - return DataFrame(results, index=result_index, columns=res_columns) + return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) - result_index = arg1.index.union(arg2.index) for i in range(len(arg1.columns)): for j in range(len(arg2.columns)): if j < i and arg2 is arg1: @@ -63,10 +58,10 @@ def dataframe_from_int_dict(data, frame_template): results[i][j] = f( *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) ) - result_index = results[i][j].index from pandas import concat + result_index = arg1.index.union(arg2.index) if len(result_index): # construct result frame diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 079cbd99fc219..5475febb637df 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -249,7 +249,7 @@ def _check_window_bounds( def _slice_index(self, index: Index, result: Sized | None = None) -> Index: """ - Slices the index for a given result. + Slices the index for a given result and the preset step.
""" return ( index @@ -478,11 +478,8 @@ def hfunc(values: ArrayLike) -> ArrayLike: # GH#42736 operate column-wise instead of block-wise try: res = hfunc(arr) - except TypeError: + except (TypeError, NotImplementedError): pass - except NotImplementedError as err: - if "step not implemented" in str(err): - raise else: res_values.append(res) taker.append(i) @@ -661,11 +658,7 @@ def _numba_apply( out = obj._constructor(result, index=index, name=obj.name) return out else: - columns = ( - obj.columns - if result.shape[1] == len(obj.columns) - else obj.columns[:: self.step] - ) + columns = self._slice_index(obj.columns, result.T) out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) @@ -705,6 +698,9 @@ def __init__( # groupby., but unexpected to users in # groupby.rolling. obj = obj.drop(columns=self._grouper.names, errors="ignore") + # GH 15354 + if kwargs.get("step") is not None: + raise NotImplementedError("step not implemented for rolling groupby") super().__init__(obj, *args, **kwargs) def _apply( @@ -957,6 +953,16 @@ class Window(BaseWindow): The closed parameter with fixed windows is now supported. + step : int, default None + When supported, applies ``[::step]`` to the resulting sequence of windows, in a + computationally efficient manner. Currently supported only with fixed-length + window indexers. Note that using a step argument other than None or 1 will + produce a result with a different shape than the input. + + .. versionadded:: 1.5 + + The step parameter is only supported with fixed windows. + method : str {'single', 'table'}, default 'single' .. versionadded:: 1.3.0 @@ -1075,6 +1081,17 @@ class Window(BaseWindow): 3 3.0 4 6.0 + **step** + + Rolling sum with a window length of 2 observations, minimum of 1 observation to + calculate a value, and a step of 2.
+ + >>> df.rolling(2, min_periods=1, step=2).sum() + B + 0 0.0 + 2 3.0 + 4 4.0 + **win_type** Rolling sum with a window length of 2, using the Scipy ``'gaussian'`` @@ -1759,9 +1776,22 @@ def _validate(self): elif isinstance(self.window, BaseIndexer): # Passed BaseIndexer subclass should handle all other rolling kwargs - return + pass elif not is_integer(self.window) or self.window < 0: raise ValueError("window must be an integer 0 or greater") + # GH 15354: + # validate window indexer parameters do not raise in get_window_bounds + # this cannot be done in BaseWindow._validate because there _get_window_indexer + # would erroneously create a fixed window given a window argument like "1s" due + # to _win_freq_i8 not being set + indexer = self._get_window_indexer() + indexer.get_window_bounds( + num_values=0, + min_periods=self.min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) def _validate_monotonic(self): """ @@ -2647,15 +2677,6 @@ class RollingGroupby(BaseWindowGroupby, Rolling): Provide a rolling groupby implementation. 
""" - def __init__( - self, - *args, - **kwargs, - ): - if kwargs.get("step") is not None: - raise NotImplementedError("step not implemented for rolling groupby") - super().__init__(*args, **kwargs) - _attributes = Rolling._attributes + BaseWindowGroupby._attributes def _get_window_indexer(self) -> GroupbyIndexer: diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 2cee3ea1c1ec5..4704bdc62a1bf 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, @@ -54,7 +52,7 @@ def test_rolling_apply_out_of_bounds(engine_and_raw): @pytest.mark.parametrize("window", [2, "2s"]) -@td.step_not_implemented(when=lambda bargs: bargs.get("window") == "2s") +@pytest.mark.parametrize("step", [None]) def test_rolling_apply_with_pandas_objects(window, step): # 5071 df = DataFrame( @@ -98,7 +96,7 @@ def test_rolling_apply(engine_and_raw, step): tm.assert_series_equal(result, expected) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_all_apply(engine_and_raw, step): engine, raw = engine_and_raw @@ -116,7 +114,7 @@ def test_all_apply(engine_and_raw, step): tm.assert_frame_equal(result, expected) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_ragged_apply(engine_and_raw, step): engine, raw = engine_and_raw diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index a47a43845acfb..aef79f97bf93d 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, MultiIndex, @@ -261,7 +259,7 @@ def test_rolling_forward_cov_corr(func, expected): ["left", [0.0, 0.0, 1.0, 2.0, 5.0, 9.0, 5.0, 6.0, 7.0, 8.0]], ], ) -@td.step_not_implemented() 
+@pytest.mark.parametrize("step", [None]) def test_non_fixed_variable_window_indexer(closed, expected_data, step): index = date_range("2020", periods=10) df = DataFrame(range(10), index=index) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 0374a1515d5f9..92bddac9e0da5 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -11,7 +11,6 @@ is_platform_mac, ) from pandas.errors import UnsupportedFunctionCall -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -27,8 +26,17 @@ ) import pandas._testing as tm from pandas.api.indexers import BaseIndexer +from pandas.core.indexers.objects import ( + ExpandingIndexer, + ExponentialMovingWindowIndexer, + GroupbyIndexer, + VariableOffsetWindowIndexer, + VariableWindowIndexer, +) from pandas.core.window import Rolling +from pandas.tseries.offsets import BusinessDay + def test_doc_string(): @@ -82,8 +90,38 @@ def test_invalid_constructor(frame_or_series, w): c(window=2, min_periods=1, center=w) +@pytest.mark.parametrize( + "window", + [ + timedelta(days=3), + Timedelta(days=3), + "3D", + ExpandingIndexer(window_size=3), + ExponentialMovingWindowIndexer(window_size=3), + GroupbyIndexer(window_size=3), + VariableOffsetWindowIndexer( + index=date_range("2015-12-26", periods=3), offset=BusinessDay(1) + ), + VariableWindowIndexer(window_size=3), + ], +) +def test_constructor_step_not_implemented(window, step): + # GH 15354 + n = 10 + df = DataFrame( + {"value": np.arange(n)}, + index=date_range("2015-12-24", periods=n, freq="D"), + ) + f = lambda: df.rolling(window=window, step=step) + if step is None: + f() + else: + with pytest.raises(NotImplementedError, match="step not implemented"): + f() + + @pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3)]) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_constructor_with_timedelta_window(window, step): # GH 15440 n = 10 @@ -104,7 
+142,7 @@ def test_constructor_with_timedelta_window(window, step): @pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3), "3D"]) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_constructor_timedelta_window_and_minperiods(window, step, raw): # GH 15305 n = 10 @@ -138,7 +176,7 @@ def test_numpy_compat(method): @pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_closed_fixed(closed, arithmetic_win_operators, step): # GH 34315 func_name = arithmetic_win_operators @@ -204,7 +242,7 @@ def test_closed_fixed(closed, arithmetic_win_operators, step): ), ], ) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_datetimelike_centered_selections( closed, window_selections, step, arithmetic_win_operators ): @@ -245,7 +283,7 @@ def test_datetimelike_centered_selections( ("2s", "neither", [1.0, 2.0, 2.0]), ], ) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_datetimelike_centered_offset_covers_all( window, closed, expected, step, frame_or_series ): @@ -272,7 +310,7 @@ def test_datetimelike_centered_offset_covers_all( ("2D", "neither", [2, 2, 2, 2, 2, 2, 2, 2]), ], ) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_datetimelike_nonunique_index_centering( window, closed, expected, frame_or_series, step ): @@ -297,7 +335,7 @@ def test_datetimelike_nonunique_index_centering( tm.assert_equal(result, expected) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_even_number_window_alignment(step): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) @@ -339,7 +377,7 @@ def test_closed_fixed_binary_col(center, step): @pytest.mark.parametrize("closed", ["neither", "left"]) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_closed_empty(closed, 
arithmetic_win_operators, step): # GH 26005 func_name = arithmetic_win_operators @@ -352,7 +390,7 @@ def test_closed_empty(closed, arithmetic_win_operators, step): @pytest.mark.parametrize("func", ["min", "max"]) -@td.step_not_implemented() +@pytest.mark.parametrize("step", [None]) def test_closed_one_entry(func, step): # GH24718 ser = Series(data=[2], index=date_range("2000", periods=1)) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 7ef4a7766d835..10322a25ffd18 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -26,11 +26,6 @@ def test_foo(): from __future__ import annotations from contextlib import contextmanager -from functools import wraps -from inspect import ( - Parameter, - signature, -) import locale from typing import Callable import warnings @@ -317,38 +312,3 @@ def mark_array_manager_not_yet_implemented(request): get_option("mode.data_manager") == "array", reason="Test that relies on BlockManager internals or specific behaviour", ) - - -def step_not_implemented(when=None): - """ - Decorator factory for test cases expecting "step not implemented" errors. - - Parameters - ---------- - when : Callable - a callable accepting a BoundArguments object, which contains the arguments - passed to the test method, and returning True when a "step not implemented" - error is expected, provided the "step" argument is not None. Defaults to - always True, meaning that the error is expected only when the step argument - is not None. 
- """ - - def decorate_step_not_implemented(f): - expected_kinds = [Parameter.POSITIONAL_ONLY, Parameter.POSITIONAL_OR_KEYWORD] - sig = signature(f) - prms = sig.parameters - if prms.get("step") is None or prms["step"].kind not in expected_kinds: - raise ValueError(f"missing positional step parameter in {f}") - - @wraps(f) - def wrap_step_not_implemented(*args, **kwargs): - bargs = sig.bind(*args, **kwargs).arguments - if bargs.get("step") is not None and (when is None or when(bargs)): - with pytest.raises(NotImplementedError, match="step not implemented"): - return f(*args, **kwargs) - else: - return f(*args, **kwargs) - - return wrap_step_not_implemented - - return decorate_step_not_implemented From 3a9c089d5df1783631b06320ad4bc042c450c341 Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Mon, 21 Feb 2022 15:37:38 -0500 Subject: [PATCH 7/7] Add checks for expanding/ewm/groupby-rolling and rolling-cov/corr --- pandas/tests/window/test_rolling.py | 50 +++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 92bddac9e0da5..53e1d442d60a4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -100,19 +100,40 @@ def test_invalid_constructor(frame_or_series, w): ExponentialMovingWindowIndexer(window_size=3), GroupbyIndexer(window_size=3), VariableOffsetWindowIndexer( - index=date_range("2015-12-26", periods=3), offset=BusinessDay(1) + index=date_range("2015-12-25", periods=5), offset=BusinessDay(1) ), VariableWindowIndexer(window_size=3), ], ) -def test_constructor_step_not_implemented(window, step): +@pytest.mark.parametrize( + "func", + [ + lambda df: df.rolling, + lambda df: df.groupby("key").rolling, + ], +) +def test_constructor_step_not_implemented(window, func, step): # GH 15354 - n = 10 df = DataFrame( - {"value": np.arange(n)}, - index=date_range("2015-12-24", periods=n, freq="D"), + {"value": np.arange(10), 
"key": np.array([1] * 5 + [2] * 5)}, + index=date_range("2015-12-24", periods=10, freq="D"), + ) + f = lambda: func(df)(window=window, step=step) + if step is None: + f() + else: + with pytest.raises(NotImplementedError, match="step not implemented"): + f() + + +@pytest.mark.parametrize("agg", ["cov", "corr"]) +def test_constructor_step_not_implemented_for_cov_corr(agg, step): + # GH 15354 + df = DataFrame( + {"value": np.arange(10), "key": np.array([1] * 5 + [2] * 5)}, + index=date_range("2015-12-24", periods=10, freq="D"), ) - f = lambda: df.rolling(window=window, step=step) + f = lambda: getattr(df.rolling(window=2, step=step), agg)(df) if step is None: f() else: @@ -120,6 +141,23 @@ def test_constructor_step_not_implemented(window, step): f() +@pytest.mark.parametrize( + "func", + [ + lambda df: df.expanding, + lambda df: df.ewm, + ], +) +def test_constructor_step_unsupported(func, step): + # GH 15354 + df = DataFrame( + {"value": np.arange(10), "key": np.array([1] * 5 + [2] * 5)}, + index=date_range("2015-12-24", periods=10, freq="D"), + ) + with pytest.raises(TypeError, match="got an unexpected keyword argument 'step'"): + func(df)(step=step) + + @pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3)]) @pytest.mark.parametrize("step", [None]) def test_constructor_with_timedelta_window(window, step):