diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 2efc6c9167a83..9f1c4755bc54f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -732,6 +732,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) - Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`) - Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`) +- Bug in :meth:`Rolling.skew` and :meth:`Rolling.kurt` would give NaN with window of same values (:issue:`30993`) - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`) - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`) - Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index fdc39178238c9..68c05f2bb2c98 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -455,8 +455,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, - float64_t x, float64_t xx, - float64_t xxx) nogil: + float64_t x, float64_t xx, float64_t xxx, + int64_t num_consecutive_same_value + ) nogil: cdef: float64_t result, dnobs float64_t A, B, C, R @@ -467,6 +468,12 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, B = xx / dnobs - A * A C = xxx / dnobs - A * A * A - 3 * A * B + if nobs < 3: + result = NaN + # GH 42064 46431 + # uniform case, force result to be 0 + elif num_consecutive_same_value >= nobs: + result = 0.0 # #18044: with uniform distribution, floating issue will # cause B != 0. and cause the result is a very # large number. @@ -476,7 +483,7 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 - if B <= 1e-14 or nobs < 3: + elif B <= 1e-14: result = NaN else: R = sqrt(B) @@ -493,7 +500,10 @@ cdef inline void add_skew(float64_t val, int64_t *nobs, float64_t *xxx, float64_t *compensation_x, float64_t *compensation_xx, - float64_t *compensation_xxx) nogil: + float64_t *compensation_xxx, + int64_t *num_consecutive_same_value, + float64_t *prev_value, + ) nogil: """ add a value from the skew calc """ cdef: float64_t y, t @@ -515,6 +525,14 @@ cdef inline void add_skew(float64_t val, int64_t *nobs, compensation_xxx[0] = t - xxx[0] - y xxx[0] = t + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + cdef inline void remove_skew(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, @@ -553,8 +571,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_xx_add, compensation_xx_remove float64_t compensation_x_add, compensation_x_remove float64_t x, xx, xxx + float64_t prev_value int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0 - int64_t s, e + int64_t s, e, num_consecutive_same_value ndarray[float64_t] output, mean_array, values_copy bint is_monotonic_increasing_bounds @@ -588,6 +607,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, # never removed if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + prev_value = values[s] + num_consecutive_same_value = 0 + compensation_xxx_add = compensation_xxx_remove = 0 compensation_xx_add = compensation_xx_remove = 0 compensation_x_add = compensation_x_remove = 0 @@ -596,7 +618,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, for j in range(s, e): val = values_copy[j] add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, - &compensation_xx_add, &compensation_xxx_add) + &compensation_xx_add, &compensation_xxx_add, + &num_consecutive_same_value, &prev_value) else: @@ -612,9 +635,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, for j in range(end[i - 1], e): val = values_copy[j] add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, - &compensation_xx_add, &compensation_xxx_add) + &compensation_xx_add, &compensation_xxx_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_skew(minp, nobs, x, xx, xxx) + output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value) if not is_monotonic_increasing_bounds: nobs = 0 @@ -630,35 +654,44 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs, float64_t x, float64_t xx, - float64_t xxx, float64_t xxxx) nogil: + float64_t xxx, float64_t xxxx, + int64_t num_consecutive_same_value, + ) nogil: cdef: float64_t result, dnobs float64_t A, B, C, D, R, K if nobs >= minp: - dnobs = nobs - A = x / dnobs - R = A * A - B = xx / dnobs - R - R = R * A - C = xxx / dnobs - R - 3 * A * B - R = R * A - D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A - - # #18044: with uniform distribution, floating issue will - # cause B != 0. and cause the result is a very - # large number. - # - # in core/nanops.py nanskew/nankurt call the function - # _zero_out_fperr(m2) to fix floating error. - # if the variance is less than 1e-14, it could be - # treat as zero, here we follow the original - # skew/kurt behaviour to check B <= 1e-14 - if B <= 1e-14 or nobs < 4: + if nobs < 4: result = NaN + # GH 42064 46431 + # uniform case, force result to be -3. + elif num_consecutive_same_value >= nobs: + result = -3. else: - K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) - result = K / ((dnobs - 2.) * (dnobs - 3.)) + dnobs = nobs + A = x / dnobs + R = A * A + B = xx / dnobs - R + R = R * A + C = xxx / dnobs - R - 3 * A * B + R = R * A + D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A + + # #18044: with uniform distribution, floating issue will + # cause B != 0. and cause the result is a very + # large number. + # + # in core/nanops.py nanskew/nankurt call the function + # _zero_out_fperr(m2) to fix floating error. + # if the variance is less than 1e-14, it could be + # treat as zero, here we follow the original + # skew/kurt behaviour to check B <= 1e-14 + if B <= 1e-14: + result = NaN + else: + K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) + result = K / ((dnobs - 2.) * (dnobs - 3.)) else: result = NaN @@ -671,7 +704,10 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs, float64_t *compensation_x, float64_t *compensation_xx, float64_t *compensation_xxx, - float64_t *compensation_xxxx) nogil: + float64_t *compensation_xxxx, + int64_t *num_consecutive_same_value, + float64_t *prev_value + ) nogil: """ add a value from the kurotic calc """ cdef: float64_t y, t @@ -697,6 +733,14 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs, compensation_xxxx[0] = t - xxxx[0] - y xxxx[0] = t + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + cdef inline void remove_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, @@ -741,7 +785,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_xx_remove, compensation_xx_add float64_t compensation_x_remove, compensation_x_add float64_t x, xx, xxx, xxxx - int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0 + float64_t prev_value + int64_t nobs, s, e, num_consecutive_same_value + int64_t N = len(start), V = len(values), nobs_mean = 0 ndarray[float64_t] output, values_copy bint is_monotonic_increasing_bounds @@ -775,6 +821,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, # never removed if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + prev_value = values[s] + num_consecutive_same_value = 0 + compensation_xxxx_add = compensation_xxxx_remove = 0 compensation_xxx_remove = compensation_xxx_add = 0 compensation_xx_remove = compensation_xx_add = 0 @@ -784,7 +833,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, for j in range(s, e): add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_add, &compensation_xx_add, - &compensation_xxx_add, &compensation_xxxx_add) + &compensation_xxx_add, &compensation_xxxx_add, + &num_consecutive_same_value, &prev_value) else: @@ -800,9 +850,10 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, for j in range(end[i - 1], e): add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_add, &compensation_xx_add, - &compensation_xxx_add, &compensation_xxxx_add) + &compensation_xxx_add, &compensation_xxxx_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, num_consecutive_same_value) if not is_monotonic_increasing_bounds: nobs = 0 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 9c46a7194c9c5..4c26cfb95fd85 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1860,3 +1860,14 @@ def test_rolling_mean_sum_floating_artifacts(): assert (result[-3:] == 0).all() result = r.sum() assert (result[-3:] == 0).all() + + +def test_rolling_skew_kurt_floating_artifacts(): + # GH 42064 46431 + + sr = Series([1 / 3, 4, 0, 0, 0, 0, 0]) + r = sr.rolling(4) + result = r.skew() + assert (result[-2:] == 0).all() + result = r.kurt() + assert (result[-2:] == -3).all() diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index 152172d7b2266..4489fada9c11e 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -178,17 +178,18 @@ def test_center_reindex_frame(frame, roll_func): def test_rolling_skew_edge_cases(step): - all_nan = Series([np.NaN] * 5)[::step] - + expected = Series([np.NaN] * 4 + [0.0])[::step] # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5, step=step).skew() - tm.assert_series_equal(all_nan, x) + # index 4 should be 0 as it contains 5 same obs + tm.assert_series_equal(expected, x) + expected = Series([np.NaN] * 5)[::step] # yields all NaN (window too small) d = Series(np.random.randn(5)) x = d.rolling(window=2, step=step).skew() - tm.assert_series_equal(all_nan, x) + tm.assert_series_equal(expected, x) # yields [NaN, NaN, NaN, 0.177994, 1.548824] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) @@ -199,17 +200,18 @@ def test_rolling_skew_edge_cases(step): def test_rolling_kurt_edge_cases(step): - all_nan = Series([np.NaN] * 5)[::step] + expected = Series([np.NaN] * 4 + [-3.0])[::step] # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5, step=step).kurt() - tm.assert_series_equal(all_nan, x) + tm.assert_series_equal(expected, x) # yields all NaN (window too small) + expected = Series([np.NaN] * 5)[::step] d = Series(np.random.randn(5)) x = d.rolling(window=3, step=step).kurt() - tm.assert_series_equal(all_nan, x) + tm.assert_series_equal(expected, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) @@ -220,11 +222,15 @@ def test_rolling_kurt_edge_cases(step): def test_rolling_skew_eq_value_fperr(step): # #18804 all rolling skew for all equal values should return Nan + # #46717 update: all equal values should return 0 instead of NaN a = Series([1.1] * 15).rolling(window=10, step=step).skew() - assert np.isnan(a).all() + assert (a[a.index >= 9] == 0).all() + assert a[a.index < 9].isna().all() def test_rolling_kurt_eq_value_fperr(step): # #18804 all rolling kurt for all equal values should return Nan + # #46717 update: all equal values should return -3 instead of NaN a = Series([1.1] * 15).rolling(window=10, step=step).kurt() - assert np.isnan(a).all() + assert (a[a.index >= 9] == -3).all() + assert a[a.index < 9].isna().all()