Skip to content

Commit

Permalink
EHN: add same value count for skew & kurt (pandas-dev#46717)
Browse files Browse the repository at this point in the history
* add same value count for skew & kurt

* add same value count for skew & kurt

* update doc

* merged

* restore old comments

Co-authored-by: auderson <liao.renjie@techfin.ai>
  • Loading branch information
2 people authored and yehoshuadimarsky committed Jul 13, 2022
1 parent 0979db5 commit bf9ea1d
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 44 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,7 @@ Groupby/resample/rolling
- Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`)
- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
- Bug in :meth:`Rolling.skew` and :meth:`Rolling.kurt` would give NaN with window of same values (:issue:`30993`)
- Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
- Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
- Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`)
Expand Down
121 changes: 86 additions & 35 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,


cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
float64_t x, float64_t xx,
float64_t xxx) nogil:
float64_t x, float64_t xx, float64_t xxx,
int64_t num_consecutive_same_value
) nogil:
cdef:
float64_t result, dnobs
float64_t A, B, C, R
Expand All @@ -467,6 +468,12 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
B = xx / dnobs - A * A
C = xxx / dnobs - A * A * A - 3 * A * B

if nobs < 3:
result = NaN
# GH 42064 46431
# uniform case, force result to be 0
elif num_consecutive_same_value >= nobs:
result = 0.0
# #18044: with uniform distribution, floating issue will
# cause B != 0. and cause the result is a very
# large number.
Expand All @@ -476,7 +483,7 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
# if the variance is less than 1e-14, it could be
# treat as zero, here we follow the original
# skew/kurt behaviour to check B <= 1e-14
if B <= 1e-14 or nobs < 3:
elif B <= 1e-14:
result = NaN
else:
R = sqrt(B)
Expand All @@ -493,7 +500,10 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
float64_t *compensation_xxx,
int64_t *num_consecutive_same_value,
float64_t *prev_value,
) nogil:
""" add a value from the skew calc """
cdef:
float64_t y, t
Expand All @@ -515,6 +525,14 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
num_consecutive_same_value[0] += 1
else:
# reset to 1 (include current value itself)
num_consecutive_same_value[0] = 1
prev_value[0] = val


cdef inline void remove_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
Expand Down Expand Up @@ -553,8 +571,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
float64_t compensation_xx_add, compensation_xx_remove
float64_t compensation_x_add, compensation_x_remove
float64_t x, xx, xxx
float64_t prev_value
int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
int64_t s, e
int64_t s, e, num_consecutive_same_value
ndarray[float64_t] output, mean_array, values_copy
bint is_monotonic_increasing_bounds

Expand Down Expand Up @@ -588,6 +607,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
# never removed
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

prev_value = values[s]
num_consecutive_same_value = 0

compensation_xxx_add = compensation_xxx_remove = 0
compensation_xx_add = compensation_xx_remove = 0
compensation_x_add = compensation_x_remove = 0
Expand All @@ -596,7 +618,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(s, e):
val = values_copy[j]
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)
&compensation_xx_add, &compensation_xxx_add,
&num_consecutive_same_value, &prev_value)

else:

Expand All @@ -612,9 +635,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(end[i - 1], e):
val = values_copy[j]
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)
&compensation_xx_add, &compensation_xxx_add,
&num_consecutive_same_value, &prev_value)

output[i] = calc_skew(minp, nobs, x, xx, xxx)
output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)

if not is_monotonic_increasing_bounds:
nobs = 0
Expand All @@ -630,35 +654,44 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,

cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
float64_t x, float64_t xx,
float64_t xxx, float64_t xxxx) nogil:
float64_t xxx, float64_t xxxx,
int64_t num_consecutive_same_value,
) nogil:
cdef:
float64_t result, dnobs
float64_t A, B, C, D, R, K

if nobs >= minp:
dnobs = <float64_t>nobs
A = x / dnobs
R = A * A
B = xx / dnobs - R
R = R * A
C = xxx / dnobs - R - 3 * A * B
R = R * A
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A

# #18044: with uniform distribution, floating issue will
# cause B != 0. and cause the result is a very
# large number.
#
# in core/nanops.py nanskew/nankurt call the function
# _zero_out_fperr(m2) to fix floating error.
# if the variance is less than 1e-14, it could be
# treat as zero, here we follow the original
# skew/kurt behaviour to check B <= 1e-14
if B <= 1e-14 or nobs < 4:
if nobs < 4:
result = NaN
# GH 42064 46431
# uniform case, force result to be -3.
elif num_consecutive_same_value >= nobs:
result = -3.
else:
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
result = K / ((dnobs - 2.) * (dnobs - 3.))
dnobs = <float64_t>nobs
A = x / dnobs
R = A * A
B = xx / dnobs - R
R = R * A
C = xxx / dnobs - R - 3 * A * B
R = R * A
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A

# #18044: with uniform distribution, floating issue will
# cause B != 0. and cause the result is a very
# large number.
#
# in core/nanops.py nanskew/nankurt call the function
# _zero_out_fperr(m2) to fix floating error.
# if the variance is less than 1e-14, it could be
# treat as zero, here we follow the original
# skew/kurt behaviour to check B <= 1e-14
if B <= 1e-14:
result = NaN
else:
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
result = K / ((dnobs - 2.) * (dnobs - 3.))
else:
result = NaN

Expand All @@ -671,7 +704,10 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
float64_t *compensation_xxxx,
int64_t *num_consecutive_same_value,
float64_t *prev_value
) nogil:
""" add a value from the kurotic calc """
cdef:
float64_t y, t
Expand All @@ -697,6 +733,14 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
num_consecutive_same_value[0] += 1
else:
# reset to 1 (include current value itself)
num_consecutive_same_value[0] = 1
prev_value[0] = val


cdef inline void remove_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
Expand Down Expand Up @@ -741,7 +785,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
float64_t compensation_xx_remove, compensation_xx_add
float64_t compensation_x_remove, compensation_x_add
float64_t x, xx, xxx, xxxx
int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0
float64_t prev_value
int64_t nobs, s, e, num_consecutive_same_value
int64_t N = len(start), V = len(values), nobs_mean = 0
ndarray[float64_t] output, values_copy
bint is_monotonic_increasing_bounds

Expand Down Expand Up @@ -775,6 +821,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
# never removed
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

prev_value = values[s]
num_consecutive_same_value = 0

compensation_xxxx_add = compensation_xxxx_remove = 0
compensation_xxx_remove = compensation_xxx_add = 0
compensation_xx_remove = compensation_xx_add = 0
Expand All @@ -784,7 +833,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(s, e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)
&compensation_xxx_add, &compensation_xxxx_add,
&num_consecutive_same_value, &prev_value)

else:

Expand All @@ -800,9 +850,10 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
for j in range(end[i - 1], e):
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)
&compensation_xxx_add, &compensation_xxxx_add,
&num_consecutive_same_value, &prev_value)

output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, num_consecutive_same_value)

if not is_monotonic_increasing_bounds:
nobs = 0
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1860,3 +1860,14 @@ def test_rolling_mean_sum_floating_artifacts():
assert (result[-3:] == 0).all()
result = r.sum()
assert (result[-3:] == 0).all()


def test_rolling_skew_kurt_floating_artifacts():
# GH 42064 46431

sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
r = sr.rolling(4)
result = r.skew()
assert (result[-2:] == 0).all()
result = r.kurt()
assert (result[-2:] == -3).all()
24 changes: 15 additions & 9 deletions pandas/tests/window/test_rolling_skew_kurt.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,17 +178,18 @@ def test_center_reindex_frame(frame, roll_func):

def test_rolling_skew_edge_cases(step):

all_nan = Series([np.NaN] * 5)[::step]

expected = Series([np.NaN] * 4 + [0.0])[::step]
# yields all NaN (0 variance)
d = Series([1] * 5)
x = d.rolling(window=5, step=step).skew()
tm.assert_series_equal(all_nan, x)
# index 4 should be 0 as it contains 5 same obs
tm.assert_series_equal(expected, x)

expected = Series([np.NaN] * 5)[::step]
# yields all NaN (window too small)
d = Series(np.random.randn(5))
x = d.rolling(window=2, step=step).skew()
tm.assert_series_equal(all_nan, x)
tm.assert_series_equal(expected, x)

# yields [NaN, NaN, NaN, 0.177994, 1.548824]
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
Expand All @@ -199,17 +200,18 @@ def test_rolling_skew_edge_cases(step):

def test_rolling_kurt_edge_cases(step):

all_nan = Series([np.NaN] * 5)[::step]
expected = Series([np.NaN] * 4 + [-3.0])[::step]

# yields all NaN (0 variance)
d = Series([1] * 5)
x = d.rolling(window=5, step=step).kurt()
tm.assert_series_equal(all_nan, x)
tm.assert_series_equal(expected, x)

# yields all NaN (window too small)
expected = Series([np.NaN] * 5)[::step]
d = Series(np.random.randn(5))
x = d.rolling(window=3, step=step).kurt()
tm.assert_series_equal(all_nan, x)
tm.assert_series_equal(expected, x)

# yields [NaN, NaN, NaN, 1.224307, 2.671499]
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
Expand All @@ -220,11 +222,15 @@ def test_rolling_kurt_edge_cases(step):

def test_rolling_skew_eq_value_fperr(step):
# #18804 all rolling skew for all equal values should return Nan
# #46717 update: all equal values should return 0 instead of NaN
a = Series([1.1] * 15).rolling(window=10, step=step).skew()
assert np.isnan(a).all()
assert (a[a.index >= 9] == 0).all()
assert a[a.index < 9].isna().all()


def test_rolling_kurt_eq_value_fperr(step):
# #18804 all rolling kurt for all equal values should return Nan
# #46717 update: all equal values should return -3 instead of NaN
a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
assert np.isnan(a).all()
assert (a[a.index >= 9] == -3).all()
assert a[a.index < 9].isna().all()

0 comments on commit bf9ea1d

Please sign in to comment.