pandas-dev · mroeschke · Apr 19, 2022 · Mar 25, 2022 · Mar 25, 2022 · Mar 26, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -95,6 +95,9 @@ Other enhancements
 - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
 - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
+- :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now generate correct result 0 for window containing same values (:issue:`42064`)
+- :meth:`DataFrame.rolling.skew`, :meth:`Series.rolling.skew` now generate correct result 0 for window containing same values (:issue:`42064`)
+- :meth:`DataFrame.rolling.kurt`, :meth:`Series.rolling.kurt` now generate correct result -3 for window containing same values (:issue:`42064`, :issue:`30993`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:

@@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
 
 
 cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
-                               float64_t ssqdm_x) nogil:
+                               float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil:
     cdef:
         float64_t result
 
     # Variance is unchanged if no observation is added or removed
     if (nobs >= minp) and (nobs > ddof):
 
-        # pathological case
-        if nobs == 1:
+        # pathological case & repeatedly same values case
+        if nobs == 1 or num_consecutive_same_value >= nobs:
             result = 0
         else:
             result = ssqdm_x / (nobs - <float64_t>ddof)
@@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
 
 
 cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
-                         float64_t *ssqdm_x, float64_t *compensation) nogil:
+                         float64_t *ssqdm_x, float64_t *compensation,
+                         int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
     """ add a value from the var calc """
     cdef:
         float64_t delta, prev_mean, y, t
@@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
         return
 
     nobs[0] = nobs[0] + 1
+
+    # GH#42064, record num of same values to remove floating point artifacts
+    if val == prev_value[0]:
+        num_consecutive_same_value[0] += 1
+    else:
+        # reset to 1 (include current value itself)
+        num_consecutive_same_value[0] = 1
+    prev_value[0] = val
+
     # Welford's method for the online variance-calculation
     # using Kahan summation
     # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
@@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
     """
     cdef:
         float64_t mean_x, ssqdm_x, nobs, compensation_add,
-        float64_t compensation_remove,
-        float64_t val, prev, delta, mean_x_old
-        int64_t s, e
+        float64_t compensation_remove, prev_value
+        int64_t s, e, num_consecutive_same_value
         Py_ssize_t i, j, N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
@@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
             # never removed
             if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
 
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
                 mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
                 for j in range(s, e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
+                            &num_consecutive_same_value, &prev_value)
 
             else:
 
@@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
+                            &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
+            output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0.0

diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py
@@ -81,7 +81,7 @@ def sliding_sum(
                     val, nobs, sum_x, compensation_add
                 )
 
-        if nobs == 0 == nobs:
+        if nobs == 0 == min_periods:
             result = 0.0
         elif nobs >= min_periods:
             result = sum_x

diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py
@@ -16,9 +16,22 @@
 
 @numba.jit(nopython=True, nogil=True, parallel=False)
 def add_var(
-    val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
-) -> tuple[int, float, float, float]:
+    val: float,
+    nobs: int,
+    mean_x: float,
+    ssqdm_x: float,
+    compensation: float,
+    num_consecutive_same_value: int,
+    prev_value: float,
+) -> tuple[int, float, float, float, int, float]:
     if not np.isnan(val):
+
+        if val == prev_value:
+            num_consecutive_same_value += 1
+        else:
+            num_consecutive_same_value = 1
+        prev_value = val
+
         nobs += 1
         prev_mean = mean_x - compensation
         y = val - compensation
@@ -30,7 +43,7 @@ def add_var(
         else:
             mean_x = 0
         ssqdm_x += (val - prev_mean) * (val - mean_x)
-    return nobs, mean_x, ssqdm_x, compensation
+    return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
 
 
 @numba.jit(nopython=True, nogil=True, parallel=False)
@@ -79,10 +92,27 @@ def sliding_var(
         s = start[i]
         e = end[i]
         if i == 0 or not is_monotonic_increasing_bounds:
+
+            prev_value = values[s]
+            num_consecutive_same_value = 0
+
             for j in range(s, e):
                 val = values[j]
-                nobs, mean_x, ssqdm_x, compensation_add = add_var(
-                    val, nobs, mean_x, ssqdm_x, compensation_add
+                (
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
+                ) = add_var(
+                    val,
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
                 )
         else:
             for j in range(start[i - 1], s):
@@ -93,12 +123,25 @@ def sliding_var(
 
             for j in range(end[i - 1], e):
                 val = values[j]
-                nobs, mean_x, ssqdm_x, compensation_add = add_var(
-                    val, nobs, mean_x, ssqdm_x, compensation_add
+                (
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
+                ) = add_var(
+                    val,
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
                 )
 
         if nobs >= min_periods and nobs > ddof:
-            if nobs == 1:
+            if nobs == 1 or num_consecutive_same_value >= nobs:
                 result = 0.0
             else:
                 result = ssqdm_x / (nobs - ddof)

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -2150,24 +2150,21 @@ def median(
         The default ``ddof`` of 1 used in :meth:`Series.std` is different
         than the default ``ddof`` of 0 in :func:`numpy.std`.
 
-        A minimum of one period is required for the rolling calculation.
-
-        The implementation is susceptible to floating point imprecision as
-        shown in the example below.\n
+        A minimum of one period is required for the rolling calculation.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).std()
-        0             NaN
-        1             NaN
-        2    5.773503e-01
-        3    1.000000e+00
-        4    1.000000e+00
-        5    1.154701e+00
-        6    2.580957e-08
+        0         NaN
+        1         NaN
+        2    0.577350
+        3    1.000000
+        4    1.000000
+        5    1.154701
+        6    0.000000
         dtype: float64
         """
         ).replace("\n", "", 1),
@@ -2212,24 +2209,21 @@ def std(
         The default ``ddof`` of 1 used in :meth:`Series.var` is different
         than the default ``ddof`` of 0 in :func:`numpy.var`.
 
-        A minimum of one period is required for the rolling calculation.
-
-        The implementation is susceptible to floating point imprecision as
-        shown in the example below.\n
+        A minimum of one period is required for the rolling calculation.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).var()
-        0             NaN
-        1             NaN
-        2    3.333333e-01
-        3    1.000000e+00
-        4    1.000000e+00
-        5    1.333333e+00
-        6    6.661338e-16
+        0         NaN
+        1         NaN
+        2    0.333333
+        3    1.000000
+        4    1.000000
+        5    1.333333
+        6    0.000000
         dtype: float64
         """
         ).replace("\n", "", 1),

diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
@@ -79,7 +79,20 @@ def f(x, *args):
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")]
+        "data",
+        [
+            DataFrame(np.eye(5)),
+            DataFrame(
+                [
+                    [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
+                    [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
+                    [np.nan, np.nan, 5, 6, 7, 5, 5, 5, 5, 5],
+                ]
+            ).T,
+            Series(range(5), name="foo"),
+            Series([20, 10, 10, np.inf, 1, 1, 2, 3]),
+            Series([20, 10, 10, np.nan, 10, 1, 2, 3]),
+        ],
     )
     def test_numba_vs_cython_rolling_methods(
         self,
@@ -95,7 +108,7 @@ def test_numba_vs_cython_rolling_methods(
 
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
 
-        roll = data.rolling(2, step=step)
+        roll = data.rolling(3, step=step)
         result = getattr(roll, method)(
             engine="numba", engine_kwargs=engine_kwargs, **kwargs
         )