EHN: add same value count for skew & kurt (pandas-dev#46717)

* add same value count for skew & kurt * add same value count for skew & kurt * update doc * merged * restore old comments Co-authored-by: auderson <liao.renjie@techfin.ai>
yehoshuadimarsky · Jul 13, 2022 · bf9ea1d · bf9ea1d
1 parent 0979db5
commit bf9ea1d
Show file tree

Hide file tree

Showing 4 changed files with 113 additions and 44 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -732,6 +732,7 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
 - Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`)
 - Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
+- Bug in :meth:`Rolling.skew` and :meth:`Rolling.kurt` would give NaN with window of same values (:issue:`30993`)
 - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
 - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
 - Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`)

diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -455,8 +455,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 
 
 cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
-                                float64_t x, float64_t xx,
-                                float64_t xxx) nogil:
+                                float64_t x, float64_t xx, float64_t xxx,
+                                int64_t num_consecutive_same_value
+                                ) nogil:
     cdef:
         float64_t result, dnobs
         float64_t A, B, C, R
@@ -467,6 +468,12 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
         B = xx / dnobs - A * A
         C = xxx / dnobs - A * A * A - 3 * A * B
 
+        if nobs < 3:
+            result = NaN
+        # GH 42064 46431
+        # uniform case, force result to be 0
+        elif num_consecutive_same_value >= nobs:
+            result = 0.0
         # #18044: with uniform distribution, floating issue will
         #         cause B != 0. and cause the result is a very
         #         large number.
@@ -476,7 +483,7 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
         #         if the variance is less than 1e-14, it could be
         #         treat as zero, here we follow the original
         #         skew/kurt behaviour to check B <= 1e-14
-        if B <= 1e-14 or nobs < 3:
+        elif B <= 1e-14:
             result = NaN
         else:
             R = sqrt(B)
@@ -493,7 +500,10 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
                           float64_t *xxx,
                           float64_t *compensation_x,
                           float64_t *compensation_xx,
-                          float64_t *compensation_xxx) nogil:
+                          float64_t *compensation_xxx,
+                          int64_t *num_consecutive_same_value,
+                          float64_t *prev_value,
+                          ) nogil:
     """ add a value from the skew calc """
     cdef:
         float64_t y, t
@@ -515,6 +525,14 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
         compensation_xxx[0] = t - xxx[0] - y
         xxx[0] = t
 
+        # GH#42064, record num of same values to remove floating point artifacts
+        if val == prev_value[0]:
+            num_consecutive_same_value[0] += 1
+        else:
+            # reset to 1 (include current value itself)
+            num_consecutive_same_value[0] = 1
+        prev_value[0] = val
+
 
 cdef inline void remove_skew(float64_t val, int64_t *nobs,
                              float64_t *x, float64_t *xx,
@@ -553,8 +571,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
         float64_t compensation_xx_add, compensation_xx_remove
         float64_t compensation_x_add, compensation_x_remove
         float64_t x, xx, xxx
+        float64_t prev_value
         int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
-        int64_t s, e
+        int64_t s, e, num_consecutive_same_value
         ndarray[float64_t] output, mean_array, values_copy
         bint is_monotonic_increasing_bounds
 
@@ -588,6 +607,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
             # never removed
             if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
 
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
                 compensation_xxx_add = compensation_xxx_remove = 0
                 compensation_xx_add = compensation_xx_remove = 0
                 compensation_x_add = compensation_x_remove = 0
@@ -596,7 +618,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(s, e):
                     val = values_copy[j]
                     add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add)
+                             &compensation_xx_add, &compensation_xxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
             else:
 
@@ -612,9 +635,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(end[i - 1], e):
                     val = values_copy[j]
                     add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add)
+                             &compensation_xx_add, &compensation_xxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_skew(minp, nobs, x, xx, xxx)
+            output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0
@@ -630,35 +654,44 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
 
 cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
                                 float64_t x, float64_t xx,
-                                float64_t xxx, float64_t xxxx) nogil:
+                                float64_t xxx, float64_t xxxx,
+                                int64_t num_consecutive_same_value,
+                                ) nogil:
     cdef:
         float64_t result, dnobs
         float64_t A, B, C, D, R, K
 
     if nobs >= minp:
-        dnobs = <float64_t>nobs
-        A = x / dnobs
-        R = A * A
-        B = xx / dnobs - R
-        R = R * A
-        C = xxx / dnobs - R - 3 * A * B
-        R = R * A
-        D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
-
-        # #18044: with uniform distribution, floating issue will
-        #         cause B != 0. and cause the result is a very
-        #         large number.
-        #
-        #         in core/nanops.py nanskew/nankurt call the function
-        #         _zero_out_fperr(m2) to fix floating error.
-        #         if the variance is less than 1e-14, it could be
-        #         treat as zero, here we follow the original
-        #         skew/kurt behaviour to check B <= 1e-14
-        if B <= 1e-14 or nobs < 4:
+        if nobs < 4:
             result = NaN
+        # GH 42064 46431
+        # uniform case, force result to be -3.
+        elif num_consecutive_same_value >= nobs:
+            result = -3.
         else:
-            K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
-            result = K / ((dnobs - 2.) * (dnobs - 3.))
+            dnobs = <float64_t>nobs
+            A = x / dnobs
+            R = A * A
+            B = xx / dnobs - R
+            R = R * A
+            C = xxx / dnobs - R - 3 * A * B
+            R = R * A
+            D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
+
+            # #18044: with uniform distribution, floating issue will
+            #         cause B != 0. and cause the result is a very
+            #         large number.
+            #
+            #         in core/nanops.py nanskew/nankurt call the function
+            #         _zero_out_fperr(m2) to fix floating error.
+            #         if the variance is less than 1e-14, it could be
+            #         treat as zero, here we follow the original
+            #         skew/kurt behaviour to check B <= 1e-14
+            if B <= 1e-14:
+                result = NaN
+            else:
+                K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
+                result = K / ((dnobs - 2.) * (dnobs - 3.))
     else:
         result = NaN
 
@@ -671,7 +704,10 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
                           float64_t *compensation_x,
                           float64_t *compensation_xx,
                           float64_t *compensation_xxx,
-                          float64_t *compensation_xxxx) nogil:
+                          float64_t *compensation_xxxx,
+                          int64_t *num_consecutive_same_value,
+                          float64_t *prev_value
+                          ) nogil:
     """ add a value from the kurotic calc """
     cdef:
         float64_t y, t
@@ -697,6 +733,14 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
         compensation_xxxx[0] = t - xxxx[0] - y
         xxxx[0] = t
 
+        # GH#42064, record num of same values to remove floating point artifacts
+        if val == prev_value[0]:
+            num_consecutive_same_value[0] += 1
+        else:
+            # reset to 1 (include current value itself)
+            num_consecutive_same_value[0] = 1
+        prev_value[0] = val
+
 
 cdef inline void remove_kurt(float64_t val, int64_t *nobs,
                              float64_t *x, float64_t *xx,
@@ -741,7 +785,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
         float64_t compensation_xx_remove, compensation_xx_add
         float64_t compensation_x_remove, compensation_x_add
         float64_t x, xx, xxx, xxxx
-        int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0
+        float64_t prev_value
+        int64_t nobs, s, e, num_consecutive_same_value
+        int64_t N = len(start), V = len(values), nobs_mean = 0
         ndarray[float64_t] output, values_copy
         bint is_monotonic_increasing_bounds
 
@@ -775,6 +821,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
             # never removed
             if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
 
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
                 compensation_xxxx_add = compensation_xxxx_remove = 0
                 compensation_xxx_remove = compensation_xxx_add = 0
                 compensation_xx_remove = compensation_xx_add = 0
@@ -784,7 +833,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(s, e):
                     add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                              &compensation_x_add, &compensation_xx_add,
-                             &compensation_xxx_add, &compensation_xxxx_add)
+                             &compensation_xxx_add, &compensation_xxxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
             else:
 
@@ -800,9 +850,10 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
                 for j in range(end[i - 1], e):
                     add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                              &compensation_x_add, &compensation_xx_add,
-                             &compensation_xxx_add, &compensation_xxxx_add)
+                             &compensation_xxx_add, &compensation_xxxx_add,
+                             &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
+            output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0

diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1860,3 +1860,14 @@ def test_rolling_mean_sum_floating_artifacts():
     assert (result[-3:] == 0).all()
     result = r.sum()
     assert (result[-3:] == 0).all()
+
+
+def test_rolling_skew_kurt_floating_artifacts():
+    # GH 42064 46431
+
+    sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
+    r = sr.rolling(4)
+    result = r.skew()
+    assert (result[-2:] == 0).all()
+    result = r.kurt()
+    assert (result[-2:] == -3).all()
diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py
@@ -178,17 +178,18 @@ def test_center_reindex_frame(frame, roll_func):
 
 def test_rolling_skew_edge_cases(step):
 
-    all_nan = Series([np.NaN] * 5)[::step]
-
+    expected = Series([np.NaN] * 4 + [0.0])[::step]
     # yields all NaN (0 variance)
     d = Series([1] * 5)
     x = d.rolling(window=5, step=step).skew()
-    tm.assert_series_equal(all_nan, x)
+    # index 4 should be 0 as it contains 5 same obs
+    tm.assert_series_equal(expected, x)
 
+    expected = Series([np.NaN] * 5)[::step]
     # yields all NaN (window too small)
     d = Series(np.random.randn(5))
     x = d.rolling(window=2, step=step).skew()
-    tm.assert_series_equal(all_nan, x)
+    tm.assert_series_equal(expected, x)
 
     # yields [NaN, NaN, NaN, 0.177994, 1.548824]
     d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
@@ -199,17 +200,18 @@ def test_rolling_skew_edge_cases(step):
 
 def test_rolling_kurt_edge_cases(step):
 
-    all_nan = Series([np.NaN] * 5)[::step]
+    expected = Series([np.NaN] * 4 + [-3.0])[::step]
 
     # yields all NaN (0 variance)
     d = Series([1] * 5)
     x = d.rolling(window=5, step=step).kurt()
-    tm.assert_series_equal(all_nan, x)
+    tm.assert_series_equal(expected, x)
 
     # yields all NaN (window too small)
+    expected = Series([np.NaN] * 5)[::step]
     d = Series(np.random.randn(5))
     x = d.rolling(window=3, step=step).kurt()
-    tm.assert_series_equal(all_nan, x)
+    tm.assert_series_equal(expected, x)
 
     # yields [NaN, NaN, NaN, 1.224307, 2.671499]
     d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
@@ -220,11 +222,15 @@ def test_rolling_kurt_edge_cases(step):
 
 def test_rolling_skew_eq_value_fperr(step):
     # #18804 all rolling skew for all equal values should return Nan
+    # #46717 update: all equal values should return 0 instead of NaN
     a = Series([1.1] * 15).rolling(window=10, step=step).skew()
-    assert np.isnan(a).all()
+    assert (a[a.index >= 9] == 0).all()
+    assert a[a.index < 9].isna().all()
 
 
 def test_rolling_kurt_eq_value_fperr(step):
     # #18804 all rolling kurt for all equal values should return Nan
+    # #46717 update: all equal values should return -3 instead of NaN
     a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
-    assert np.isnan(a).all()
+    assert (a[a.index >= 9] == -3).all()
+    assert a[a.index < 9].isna().all()