Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: use same value counts for roll_var to remove floating point artifacts #46431

Merged
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ac1cec4
fix merge
auderson Mar 25, 2022
278393e
Merge remote-tracking branch 'upstream/main' into roll_var_remove_flo…
auderson Mar 25, 2022
b381516
bug fix: nobs == 0 == min_periods
auderson Mar 26, 2022
85f3fcd
use Py_ssize_t in place of int64_t
auderson Mar 26, 2022
318adee
use tm.assert_series_equal
auderson Mar 26, 2022
7324cc1
add more checks in roll numba methods
auderson Mar 26, 2022
3bf1b15
Merge remote-tracking branch 'upstream/main' into roll_var_remove_flo…
auderson Mar 26, 2022
ad3a87c
add more checks in roll numba methods
auderson Mar 26, 2022
66318d1
cancel check exact
auderson Mar 26, 2022
7909ffe
test 32bit: undo Py_ssize_t
auderson Mar 26, 2022
5bd7eac
add explanation for test_rolling_var_same_value_count_logic
auderson Mar 29, 2022
8b9bf76
comments updated
Mar 30, 2022
2ad5e6b
Merge remote-tracking branch 'upstream/main' into roll_var_remove_flo…
Mar 30, 2022
fd68eb4
add equal-zero test for test_rolling_var_numerical_issues & test_roll…
Apr 1, 2022
a7c2368
Merge remote-tracking branch 'upstream/main' into roll_var_remove_flo…
auderson Apr 10, 2022
c49d785
update docs
auderson Apr 10, 2022
4294c16
add more tests
auderson Apr 11, 2022
738c45b
update whats new
auderson Apr 11, 2022
ca1ee1a
update whats new
auderson Apr 11, 2022
7fa241b
Merge remote-tracking branch 'upstream/main' into roll_var_remove_flo…
Apr 12, 2022
8f0eba6
update doc
Apr 12, 2022
e57f86c
Merge branch 'main' into roll_var_remove_floating_point_artifacts
jreback Apr 18, 2022
4110084
Merge remote-tracking branch 'upstream/main' into roll_var_remove_flo…
Apr 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ Other enhancements
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
- :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now generate correct result 0 for window containing same values (:issue:`42064`)
auderson marked this conversation as resolved.
Show resolved Hide resolved
- :meth:`DataFrame.rolling.skew`, :meth:`Series.rolling.skew` now generate correct result 0 for window containing same values (:issue:`42064`)
- :meth:`DataFrame.rolling.kurt`, :meth:`Series.rolling.kurt` now generate correct result -3 for window containing same values (:issue:`42064`, :issue:`30993`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down
34 changes: 24 additions & 10 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,


cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
float64_t ssqdm_x) nogil:
float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil:
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
cdef:
float64_t result

# Variance is unchanged if no observation is added or removed
if (nobs >= minp) and (nobs > ddof):

# pathological case
if nobs == 1:
# pathological case & repeatedly same values case
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0
else:
result = ssqdm_x / (nobs - <float64_t>ddof)
Expand All @@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,


cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
float64_t *ssqdm_x, float64_t *compensation) nogil:
float64_t *ssqdm_x, float64_t *compensation,
int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
""" add a value from the var calc """
cdef:
float64_t delta, prev_mean, y, t
Expand All @@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
return

nobs[0] = nobs[0] + 1

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
num_consecutive_same_value[0] += 1
else:
# reset to 1 (include current value itself)
num_consecutive_same_value[0] = 1
prev_value[0] = val

# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
Expand Down Expand Up @@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
"""
cdef:
float64_t mean_x, ssqdm_x, nobs, compensation_add,
float64_t compensation_remove,
float64_t val, prev, delta, mean_x_old
int64_t s, e
float64_t compensation_remove, prev_value
int64_t s, e, num_consecutive_same_value
Py_ssize_t i, j, N = len(start)
ndarray[float64_t] output
bint is_monotonic_increasing_bounds
Expand All @@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
# never removed
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

prev_value = values[s]
num_consecutive_same_value = 0
mroeschke marked this conversation as resolved.
Show resolved Hide resolved

mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
for j in range(s, e):
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
&num_consecutive_same_value, &prev_value)

else:

Expand All @@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,

# calculate adds
for j in range(end[i - 1], e):
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
&num_consecutive_same_value, &prev_value)

output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)

if not is_monotonic_increasing_bounds:
nobs = 0.0
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/_numba/kernels/sum_.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def sliding_sum(
val, nobs, sum_x, compensation_add
)

if nobs == 0 == nobs:
if nobs == 0 == min_periods:
result = 0.0
elif nobs >= min_periods:
result = sum_x
Expand Down
59 changes: 51 additions & 8 deletions pandas/core/_numba/kernels/var_.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,22 @@

@numba.jit(nopython=True, nogil=True, parallel=False)
def add_var(
val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
) -> tuple[int, float, float, float]:
val: float,
nobs: int,
mean_x: float,
ssqdm_x: float,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, float, float, int, float]:
if not np.isnan(val):

if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val

nobs += 1
prev_mean = mean_x - compensation
y = val - compensation
Expand All @@ -30,7 +43,7 @@ def add_var(
else:
mean_x = 0
ssqdm_x += (val - prev_mean) * (val - mean_x)
return nobs, mean_x, ssqdm_x, compensation
return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
Expand Down Expand Up @@ -79,10 +92,27 @@ def sliding_var(
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:

prev_value = values[s]
num_consecutive_same_value = 0

for j in range(s, e):
val = values[j]
nobs, mean_x, ssqdm_x, compensation_add = add_var(
val, nobs, mean_x, ssqdm_x, compensation_add
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
Expand All @@ -93,12 +123,25 @@ def sliding_var(

for j in range(end[i - 1], e):
val = values[j]
nobs, mean_x, ssqdm_x, compensation_add = add_var(
val, nobs, mean_x, ssqdm_x, compensation_add
(
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_var(
val,
nobs,
mean_x,
ssqdm_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)

if nobs >= min_periods and nobs > ddof:
if nobs == 1:
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0.0
else:
result = ssqdm_x / (nobs - ddof)
Expand Down
38 changes: 16 additions & 22 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2150,24 +2150,21 @@ def median(
The default ``ddof`` of 1 used in :meth:`Series.std` is different
than the default ``ddof`` of 0 in :func:`numpy.std`.

A minimum of one period is required for the rolling calculation.

The implementation is susceptible to floating point imprecision as
shown in the example below.\n
A minimum of one period is required for the rolling calculation.\n
"""
).replace("\n", "", 1),
create_section_header("Examples"),
dedent(
"""
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
>>> s.rolling(3).std()
0 NaN
1 NaN
2 5.773503e-01
3 1.000000e+00
4 1.000000e+00
5 1.154701e+00
6 2.580957e-08
0 NaN
1 NaN
2 0.577350
3 1.000000
4 1.000000
5 1.154701
6 0.000000
dtype: float64
"""
).replace("\n", "", 1),
Expand Down Expand Up @@ -2212,24 +2209,21 @@ def std(
The default ``ddof`` of 1 used in :meth:`Series.var` is different
than the default ``ddof`` of 0 in :func:`numpy.var`.

A minimum of one period is required for the rolling calculation.

The implementation is susceptible to floating point imprecision as
shown in the example below.\n
A minimum of one period is required for the rolling calculation.\n
"""
).replace("\n", "", 1),
create_section_header("Examples"),
dedent(
"""
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
>>> s.rolling(3).var()
0 NaN
1 NaN
2 3.333333e-01
3 1.000000e+00
4 1.000000e+00
5 1.333333e+00
6 6.661338e-16
0 NaN
1 NaN
2 0.333333
3 1.000000
4 1.000000
5 1.333333
6 0.000000
dtype: float64
"""
).replace("\n", "", 1),
Expand Down
17 changes: 15 additions & 2 deletions pandas/tests/window/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,20 @@ def f(x, *args):
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
"data", [DataFrame(np.eye(5)), Series(range(5), name="foo")]
"data",
[
DataFrame(np.eye(5)),
DataFrame(
[
[5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
[5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
[np.nan, np.nan, 5, 6, 7, 5, 5, 5, 5, 5],
]
).T,
Series(range(5), name="foo"),
Series([20, 10, 10, np.inf, 1, 1, 2, 3]),
Series([20, 10, 10, np.nan, 10, 1, 2, 3]),
],
)
def test_numba_vs_cython_rolling_methods(
self,
Expand All @@ -95,7 +108,7 @@ def test_numba_vs_cython_rolling_methods(

engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

roll = data.rolling(2, step=step)
roll = data.rolling(3, step=step)
result = getattr(roll, method)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
Expand Down
Loading