Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EHN: add same value count for skew & kurt #46717

Merged
merged 15 commits into from
May 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,7 @@ Groupby/resample/rolling
- Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`)
- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
- Bug in :meth:`Rolling.skew` and :meth:`Rolling.kurt` would give NaN with window of same values (:issue:`30993`)
- Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
- Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
- Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`)
Expand Down
121 changes: 86 additions & 35 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,


cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
float64_t x, float64_t xx,
float64_t xxx) nogil:
float64_t x, float64_t xx, float64_t xxx,
int64_t num_consecutive_same_value
) nogil:
cdef:
float64_t result, dnobs
float64_t A, B, C, R
Expand All @@ -467,6 +468,12 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
B = xx / dnobs - A * A
C = xxx / dnobs - A * A * A - 3 * A * B

if nobs < 3:
result = NaN
# GH 42064 46431
# uniform case, force result to be 0
elif num_consecutive_same_value >= nobs:
result = 0.0
# #18044: with uniform distribution, floating issue will
# cause B != 0. and cause the result is a very
# large number.
Expand All @@ -476,7 +483,7 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
# if the variance is less than 1e-14, it could be
# treat as zero, here we follow the original
# skew/kurt behaviour to check B <= 1e-14
if B <= 1e-14 or nobs < 3:
elif B <= 1e-14:
result = NaN
else:
R = sqrt(B)
Expand All @@ -493,7 +500,10 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
float64_t *compensation_xxx,
int64_t *num_consecutive_same_value,
float64_t *prev_value,
) nogil:
""" add a value from the skew calc """
cdef:
float64_t y, t
Expand All @@ -515,6 +525,14 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
num_consecutive_same_value[0] += 1
else:
# reset to 1 (include current value itself)
num_consecutive_same_value[0] = 1
prev_value[0] = val


cdef inline void remove_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
Expand Down Expand Up @@ -553,8 +571,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
float64_t compensation_xx_add, compensation_xx_remove
float64_t compensation_x_add, compensation_x_remove
float64_t x, xx, xxx
float64_t prev_value
int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
int64_t s, e
int64_t s, e, num_consecutive_same_value
ndarray[float64_t] output, mean_array, values_copy
bint is_monotonic_increasing_bounds

Expand Down Expand Up @@ -588,6 +607,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
# never removed
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

prev_value = values[s]
num_consecutive_same_value = 0

compensation_xxx_add = compensation_xxx_remove = 0
compensation_xx_add = compensation_xx_remove = 0
compensation_x_add = compensation_x_remove = 0
Expand All @@ -596,7 +618,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(s, e):
val = values_copy[j]
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)
&compensation_xx_add, &compensation_xxx_add,
&num_consecutive_same_value, &prev_value)

else:

Expand All @@ -612,9 +635,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(end[i - 1], e):
val = values_copy[j]
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)
&compensation_xx_add, &compensation_xxx_add,
&num_consecutive_same_value, &prev_value)

output[i] = calc_skew(minp, nobs, x, xx, xxx)
output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)

if not is_monotonic_increasing_bounds:
nobs = 0
Expand All @@ -630,35 +654,44 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,

cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
float64_t x, float64_t xx,
float64_t xxx, float64_t xxxx) nogil:
float64_t xxx, float64_t xxxx,
int64_t num_consecutive_same_value,
) nogil:
cdef:
float64_t result, dnobs
float64_t A, B, C, D, R, K

if nobs >= minp:
dnobs = <float64_t>nobs
A = x / dnobs
R = A * A
B = xx / dnobs - R
R = R * A
C = xxx / dnobs - R - 3 * A * B
R = R * A
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A

# #18044: with uniform distribution, floating issue will
# cause B != 0. and cause the result is a very
# large number.
#
# in core/nanops.py nanskew/nankurt call the function
# _zero_out_fperr(m2) to fix floating error.
# if the variance is less than 1e-14, it could be
# treat as zero, here we follow the original
# skew/kurt behaviour to check B <= 1e-14
if B <= 1e-14 or nobs < 4:
if nobs < 4:
result = NaN
# GH 42064 46431
# uniform case, force result to be -3.
elif num_consecutive_same_value >= nobs:
result = -3.
else:
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
result = K / ((dnobs - 2.) * (dnobs - 3.))
dnobs = <float64_t>nobs
A = x / dnobs
R = A * A
B = xx / dnobs - R
R = R * A
C = xxx / dnobs - R - 3 * A * B
R = R * A
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A

# #18044: with uniform distribution, floating issue will
# cause B != 0. and cause the result is a very
# large number.
#
# in core/nanops.py nanskew/nankurt call the function
# _zero_out_fperr(m2) to fix floating error.
# if the variance is less than 1e-14, it could be
# treat as zero, here we follow the original
# skew/kurt behaviour to check B <= 1e-14
if B <= 1e-14:
result = NaN
else:
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
result = K / ((dnobs - 2.) * (dnobs - 3.))
else:
result = NaN

Expand All @@ -671,7 +704,10 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
float64_t *compensation_xxxx,
int64_t *num_consecutive_same_value,
float64_t *prev_value
) nogil:
""" add a value from the kurotic calc """
cdef:
float64_t y, t
Expand All @@ -697,6 +733,14 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
num_consecutive_same_value[0] += 1
else:
# reset to 1 (include current value itself)
num_consecutive_same_value[0] = 1
prev_value[0] = val


cdef inline void remove_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
Expand Down Expand Up @@ -741,7 +785,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
float64_t compensation_xx_remove, compensation_xx_add
float64_t compensation_x_remove, compensation_x_add
float64_t x, xx, xxx, xxxx
int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0
float64_t prev_value
int64_t nobs, s, e, num_consecutive_same_value
int64_t N = len(start), V = len(values), nobs_mean = 0
ndarray[float64_t] output, values_copy
bint is_monotonic_increasing_bounds

Expand Down Expand Up @@ -775,6 +821,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
# never removed
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

prev_value = values[s]
num_consecutive_same_value = 0

compensation_xxxx_add = compensation_xxxx_remove = 0
compensation_xxx_remove = compensation_xxx_add = 0
compensation_xx_remove = compensation_xx_add = 0
Expand All @@ -784,7 +833,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(s, e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)
&compensation_xxx_add, &compensation_xxxx_add,
&num_consecutive_same_value, &prev_value)

else:

Expand All @@ -800,9 +850,10 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(end[i - 1], e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)
&compensation_xxx_add, &compensation_xxxx_add,
&num_consecutive_same_value, &prev_value)

output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, num_consecutive_same_value)

if not is_monotonic_increasing_bounds:
nobs = 0
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1860,3 +1860,14 @@ def test_rolling_mean_sum_floating_artifacts():
assert (result[-3:] == 0).all()
result = r.sum()
assert (result[-3:] == 0).all()


def test_rolling_skew_kurt_floating_artifacts():
# GH 42064 46431

sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
r = sr.rolling(4)
result = r.skew()
assert (result[-2:] == 0).all()
result = r.kurt()
assert (result[-2:] == -3).all()
24 changes: 15 additions & 9 deletions pandas/tests/window/test_rolling_skew_kurt.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,17 +178,18 @@ def test_center_reindex_frame(frame, roll_func):

def test_rolling_skew_edge_cases(step):

all_nan = Series([np.NaN] * 5)[::step]

expected = Series([np.NaN] * 4 + [0.0])[::step]
# yields all NaN (0 variance)
d = Series([1] * 5)
x = d.rolling(window=5, step=step).skew()
tm.assert_series_equal(all_nan, x)
# index 4 should be 0 as it contains 5 same obs
tm.assert_series_equal(expected, x)

expected = Series([np.NaN] * 5)[::step]
# yields all NaN (window too small)
d = Series(np.random.randn(5))
x = d.rolling(window=2, step=step).skew()
tm.assert_series_equal(all_nan, x)
tm.assert_series_equal(expected, x)

# yields [NaN, NaN, NaN, 0.177994, 1.548824]
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
Expand All @@ -199,17 +200,18 @@ def test_rolling_skew_edge_cases(step):

def test_rolling_kurt_edge_cases(step):

all_nan = Series([np.NaN] * 5)[::step]
expected = Series([np.NaN] * 4 + [-3.0])[::step]

# yields all NaN (0 variance)
d = Series([1] * 5)
x = d.rolling(window=5, step=step).kurt()
tm.assert_series_equal(all_nan, x)
tm.assert_series_equal(expected, x)

# yields all NaN (window too small)
expected = Series([np.NaN] * 5)[::step]
d = Series(np.random.randn(5))
x = d.rolling(window=3, step=step).kurt()
tm.assert_series_equal(all_nan, x)
tm.assert_series_equal(expected, x)

# yields [NaN, NaN, NaN, 1.224307, 2.671499]
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
Expand All @@ -220,11 +222,15 @@ def test_rolling_kurt_edge_cases(step):

def test_rolling_skew_eq_value_fperr(step):
# #18804 all rolling skew for all equal values should return Nan
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
# #46717 update: all equal values should return 0 instead of NaN
a = Series([1.1] * 15).rolling(window=10, step=step).skew()
assert np.isnan(a).all()
assert (a[a.index >= 9] == 0).all()
assert a[a.index < 9].isna().all()


def test_rolling_kurt_eq_value_fperr(step):
# #18804 all rolling kurt for all equal values should return Nan
# #46717 update: all equal values should return -3 instead of NaN
a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
assert np.isnan(a).all()
assert (a[a.index >= 9] == -3).all()
assert a[a.index < 9].isna().all()