From cce2dbe0c2f48fffab68b85a166bd505127a4eda Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Fri, 15 Sep 2023 22:25:26 +0100 Subject: [PATCH 1/4] Fix weighted rolling variance implementation --- pandas/_libs/window/aggregations.pyx | 128 ++++++++------------------- pandas/tests/window/test_win_type.py | 15 ++++ 2 files changed, 52 insertions(+), 91 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 6365c030b695b..23e66eefae14e 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1558,7 +1558,7 @@ cdef float64_t calc_weighted_var(float64_t t, if nobs == 1: result = 0 else: - result = t * win_n / ((win_n - ddof) * sum_w) + result = t * nobs / ((nobs - ddof) * sum_w) if result < 0: result = 0 else: @@ -1599,68 +1599,20 @@ cdef void add_weighted_var(float64_t val, cdef: float64_t temp, q, r - if val != val: - return - nobs[0] = nobs[0] + 1 - q = val - mean[0] - temp = sum_w[0] + w - r = q * w / temp - - mean[0] = mean[0] + r - t[0] = t[0] + r * sum_w[0] * q - sum_w[0] = temp - - -cdef void remove_weighted_var(float64_t val, - float64_t w, - float64_t *t, - float64_t *sum_w, - float64_t *mean, - float64_t *nobs) noexcept nogil: - """ - Update weighted mean, sum of weights and sum of weighted squared - differences to remove value and weight pair from weighted variance - calculation using West's method. - - Paper: https://dl.acm.org/citation.cfm?id=359153 - - Parameters - ---------- - val: float64_t - window values - w: float64_t - window weights - t: float64_t - sum of weighted squared differences - sum_w: float64_t - sum of weights - mean: float64_t - weighted mean - nobs: float64_t - number of observations - """ - - cdef: - float64_t temp, q, r - - if val == val: - nobs[0] = nobs[0] - 1 - - if nobs[0]: - q = val - mean[0] - temp = sum_w[0] - w - r = q * w / temp + if nobs[0] == 1: + sum_w[0] = w + mean[0] = val - mean[0] = mean[0] - r - t[0] = t[0] - r * sum_w[0] * q - sum_w[0] = temp + else: + q = val - mean[0] + temp = sum_w[0] + w + r = q * w / temp - else: - t[0] = 0 - sum_w[0] = 0 - mean[0] = 0 + mean[0] = mean[0] + r + t[0] = t[0] + r * sum_w[0] * q + sum_w[0] = temp def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, @@ -1690,44 +1642,38 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, """ cdef: - float64_t t = 0, sum_w = 0, mean = 0, nobs = 0 - float64_t val, pre_val, w, pre_w - Py_ssize_t i, n, win_n - float64_t[:] output + float64_t val, w + Py_ssize_t in_i, win_i, add_i, n, win_n + float64_t[:] output, t, mean, sum_w, nobs n = len(values) win_n = len(weights) + output = np.empty(n, dtype=np.float64) + t = np.zeros(n, dtype=np.float64) + mean = np.zeros(n, dtype=np.float64) + sum_w = np.zeros(n, dtype=np.float64) + nobs = np.zeros(n, dtype=np.float64) with nogil: - - for i in range(min(win_n, n)): - add_weighted_var(values[i], weights[i], &t, - &sum_w, &mean, &nobs) - - output[i] = calc_weighted_var(t, sum_w, win_n, - ddof, nobs, minp) - - for i in range(win_n, n): - val = values[i] - pre_val = values[i - win_n] - - w = weights[i % win_n] - pre_w = weights[(i - win_n) % win_n] - - if val == val: - if pre_val == pre_val: - remove_weighted_var(pre_val, pre_w, &t, - &sum_w, &mean, &nobs) - - add_weighted_var(val, w, &t, &sum_w, &mean, &nobs) - - elif pre_val == pre_val: - remove_weighted_var(pre_val, pre_w, &t, - &sum_w, &mean, &nobs) - - output[i] = calc_weighted_var(t, sum_w, win_n, - ddof, nobs, minp) + for win_i in range(win_n): + w = weights[win_i] + if w != w: + continue + + for in_i in range(n - (win_n - win_i) + 1): + val = values[in_i] + + if val == val: + add_i = in_i + (win_n - win_i) - 1 + add_weighted_var( + val, w, &t[add_i], &sum_w[add_i], &mean[add_i], &nobs[add_i] + ) + + for in_i in range(n): + output[in_i] = calc_weighted_var( + t[in_i], sum_w[in_i], win_n, ddof, nobs[in_i], minp + ) return output diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index 5052019ddb726..ef9724644755a 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -661,6 +661,21 @@ def test_cmov_window_special_linear_range(win_types_special, step): tm.assert_series_equal(xp, rs) +@pytest.mark.parametrize("size", [10, 100, 1000]) +@pytest.mark.parametrize("scale", [0, 0.1, 0.01, 0.001, 0.0001]) +def test_weighted_std_through_mean(win_types, step, scale, size): + # GH 53273 + s = Series(np.random.default_rng(0).normal(loc=1, scale=scale, size=size)) + + squared_mean = (s**2).rolling(5, win_type=win_types).mean() + mean = s.rolling(5, win_type=win_types).mean() + expected = (squared_mean - mean**2).pow(0.5) + + result = s.rolling(5, win_type=win_types).std(ddof=0) + + tm.assert_series_equal(expected, result) + + def test_weighted_var_big_window_no_segfault(win_types, center): # GitHub Issue #46772 pytest.importorskip("scipy") From b50eedfd6817c668959c2188223611864c974d3b Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Fri, 15 Sep 2023 22:36:26 +0100 Subject: [PATCH 2/4] Add whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 03b69b53836ad..cade3397b6e80 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -191,6 +191,7 @@ Bug fixes - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in :meth:`pandas.core.window.Window.var` and :meth:`pandas.core.window.Window.std` where implementation of the algorithm was incorrect (:issue:`53273` and :issue:`54333`) Categorical ^^^^^^^^^^^ From d22266ae5df390ede02cfb3e752f53a72c290bd3 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Sat, 16 Sep 2023 00:12:05 +0100 Subject: [PATCH 3/4] Add importorskip/window --- pandas/tests/window/test_win_type.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index ef9724644755a..d325db0650c68 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -665,13 +665,15 @@ def test_cmov_window_special_linear_range(win_types_special, step): @pytest.mark.parametrize("scale", [0, 0.1, 0.01, 0.001, 0.0001]) def test_weighted_std_through_mean(win_types, step, scale, size): # GH 53273 + pytest.importorskip("scipy") + window = 5 s = Series(np.random.default_rng(0).normal(loc=1, scale=scale, size=size)) - squared_mean = (s**2).rolling(5, win_type=win_types).mean() - mean = s.rolling(5, win_type=win_types).mean() + squared_mean = (s**2).rolling(window, win_type=win_types).mean() + mean = s.rolling(window, win_type=win_types).mean() expected = (squared_mean - mean**2).pow(0.5) - result = s.rolling(5, win_type=win_types).std(ddof=0) + result = s.rolling(window, win_type=win_types).std(ddof=0) tm.assert_series_equal(expected, result) From dc16ff84aae92e6d8d07e92f26c02a67af74f0e9 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Mon, 18 Sep 2023 22:50:44 +0100 Subject: [PATCH 4/4] Calculate variance one window at a time --- pandas/_libs/window/aggregations.pyx | 33 ++++++++++------------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 23e66eefae14e..6617945fee768 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1604,6 +1604,7 @@ cdef void add_weighted_var(float64_t val, if nobs[0] == 1: sum_w[0] = w mean[0] = val + t[0] = 0 else: q = val - mean[0] @@ -1642,38 +1643,26 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, """ cdef: + float64_t t, sum_w, mean, nobs float64_t val, w - Py_ssize_t in_i, win_i, add_i, n, win_n - float64_t[:] output, t, mean, sum_w, nobs + Py_ssize_t in_i, win_i, n, win_n + float64_t[:] output n = len(values) win_n = len(weights) output = np.empty(n, dtype=np.float64) - t = np.zeros(n, dtype=np.float64) - mean = np.zeros(n, dtype=np.float64) - sum_w = np.zeros(n, dtype=np.float64) - nobs = np.zeros(n, dtype=np.float64) - with nogil: - for win_i in range(win_n): + for in_i in range(n): + nobs = 0 + for win_i in range(max(0, win_n - in_i - 1), win_n): w = weights[win_i] - if w != w: - continue - - for in_i in range(n - (win_n - win_i) + 1): - val = values[in_i] + val = values[win_i - win_n + in_i + 1] - if val == val: - add_i = in_i + (win_n - win_i) - 1 - add_weighted_var( - val, w, &t[add_i], &sum_w[add_i], &mean[add_i], &nobs[add_i] - ) + if val == val and w == w: + add_weighted_var(val, w, &t, &sum_w, &mean, &nobs) - for in_i in range(n): - output[in_i] = calc_weighted_var( - t[in_i], sum_w[in_i], win_n, ddof, nobs[in_i], minp - ) + output[in_i] = calc_weighted_var(t, sum_w, win_n, ddof, nobs, minp) return output