Skip to content

Commit 79f699a

Browse files
committed
ENH: Ensure that rolling_var of identical values is exactly zero
Add a check to rolling_var for repeated observations, in order to produce an exactly zero value of the variance when all entries are identical. Related to the discussion in #7900
1 parent 35a9527 commit 79f699a

File tree

2 files changed

+60
-65
lines changed

2 files changed

+60
-65
lines changed

pandas/algos.pyx

+44-56
Original file line numberDiff line numberDiff line change
@@ -1087,7 +1087,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
10871087
sum_wt = 1.
10881088
sum_wt2 = 1.
10891089
old_wt = 1.
1090-
1090+
10911091
for i from 1 <= i < N:
10921092
cur_x = input_x[i]
10931093
cur_y = input_y[i]
@@ -1117,7 +1117,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
11171117
elif is_observation:
11181118
mean_x = cur_x
11191119
mean_y = cur_y
1120-
1120+
11211121
if nobs >= minp:
11221122
if not bias:
11231123
numerator = sum_wt * sum_wt
@@ -1259,84 +1259,72 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
12591259
"""
12601260
Numerically stable implementation using Welford's method.
12611261
"""
1262-
cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
1263-
cdef Py_ssize_t i
1264-
cdef Py_ssize_t N = len(input)
1262+
cdef double val, prev, mean_x = 0, ssqdm_x = 0, delta, rep = NaN, out
1263+
cdef Py_ssize_t nobs = 0, nrep = 0, i, N = len(input)
12651264

12661265
cdef ndarray[double_t] output = np.empty(N, dtype=float)
12671266

12681267
minp = _check_minp(win, minp, N)
12691268

1270-
# Check for windows larger than array, addresses #7297
1271-
win = min(win, N)
1272-
1273-
# Over the first window, observations can only be added, never removed
1274-
for i from 0 <= i < win:
1269+
for i from 0 <= i < N:
12751270
val = input[i]
1276-
1277-
# Not NaN
1271+
prev = NaN if i < win else input[i - win]
1272+
1273+
# First, count the number of observations and consecutive repeats
1274+
if prev == prev:
1275+
# prev is not NaN, removing an observation...
1276+
if nobs == nrep:
1277+
# ...and removing a repeat
1278+
nrep -= 1
1279+
if nrep == 0:
1280+
rep = NaN
1281+
nobs -= 1
12781282
if val == val:
1279-
nobs += 1
1280-
delta = (val - mean_x)
1281-
mean_x += delta / nobs
1282-
ssqdm_x += delta * (val - mean_x)
1283-
1284-
if (nobs >= minp) and (nobs > ddof):
1285-
#pathological case
1286-
if nobs == 1:
1287-
val = 0
1283+
# next is not NaN, adding an observation...
1284+
if val == rep:
1285+
# ...and adding a repeat
1286+
nrep += 1
12881287
else:
1289-
val = ssqdm_x / (nobs - ddof)
1290-
if val < 0:
1291-
val = 0
1292-
else:
1293-
val = NaN
1294-
1295-
output[i] = val
1296-
1297-
# After the first window, observations can both be added and removed
1298-
for i from win <= i < N:
1299-
val = input[i]
1300-
prev = input[i - win]
1288+
# ...and resetting repeats
1289+
nrep = 1
1290+
rep = val
1291+
nobs += 1
13011292

1302-
if val == val:
1293+
# Then, compute the new mean and sum of squared differences
1294+
if nobs == nrep:
1295+
# All non-NaN values in window are identical...
1296+
ssqdm_x = 0
1297+
mean_x = rep if nobs > 0 else 0
1298+
elif val == val:
1299+
# Adding one observation...
13031300
if prev == prev:
1304-
# Adding one observation and removing another one
1301+
# ...and removing another
13051302
delta = val - prev
13061303
prev -= mean_x
13071304
mean_x += delta / nobs
13081305
val -= mean_x
13091306
ssqdm_x += (val + prev) * delta
13101307
else:
1311-
# Adding one observation and not removing any
1312-
nobs += 1
1313-
delta = (val - mean_x)
1308+
# ...and not removing any
1309+
delta = val - mean_x
13141310
mean_x += delta / nobs
13151311
ssqdm_x += delta * (val - mean_x)
13161312
elif prev == prev:
13171313
# Adding no new observation, but removing one
1318-
nobs -= 1
1319-
if nobs:
1320-
delta = (prev - mean_x)
1321-
mean_x -= delta / nobs
1322-
ssqdm_x -= delta * (prev - mean_x)
1323-
else:
1324-
mean_x = 0
1325-
ssqdm_x = 0
1314+
delta = prev - mean_x
1315+
mean_x -= delta / nobs
1316+
ssqdm_x -= delta * (prev - mean_x)
13261317
# Variance is unchanged if no observation is added or removed
13271318

1328-
if (nobs >= minp) and (nobs > ddof):
1329-
#pathological case
1330-
if nobs == 1:
1331-
val = 0
1332-
else:
1333-
val = ssqdm_x / (nobs - ddof)
1334-
if val < 0:
1335-
val = 0
1319+
# Finally, compute and write the rolling variance to the output array
1320+
if nobs >= minp and nobs > ddof:
1321+
out = ssqdm_x / (nobs - ddof)
1322+
if out < 0:
1323+
out = 0
13361324
else:
1337-
val = NaN
1325+
out = NaN
13381326

1339-
output[i] = val
1327+
output[i] = out
13401328

13411329
return output
13421330

pandas/stats/tests/test_moments.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
import numpy as np
88

99
from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
10-
from pandas.util.testing import (
11-
assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal
12-
)
10+
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
11+
assert_frame_equal, assert_panel_equal,
12+
assert_index_equal,)
1313
import pandas.core.datetools as datetools
1414
import pandas.stats.moments as mom
1515
import pandas.util.testing as tm
@@ -524,7 +524,7 @@ def test_ewma(self):
524524
self.assertTrue(np.abs(result - 1) < 1e-2)
525525

526526
s = Series([1.0, 2.0, 4.0, 8.0])
527-
527+
528528
expected = Series([1.0, 1.6, 2.736842, 4.923077])
529529
for f in [lambda s: mom.ewma(s, com=2.0, adjust=True),
530530
lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False),
@@ -750,7 +750,7 @@ def _non_null_values(x):
750750

751751
for (std, var, cov) in [(std_biased, var_biased, cov_biased),
752752
(std_unbiased, var_unbiased, cov_unbiased)]:
753-
753+
754754
# check that var(x), std(x), and cov(x) are all >= 0
755755
var_x = var(x)
756756
std_x = std(x)
@@ -762,7 +762,7 @@ def _non_null_values(x):
762762

763763
# check that var(x) == cov(x, x)
764764
assert_equal(var_x, cov_x_x)
765-
765+
766766
# check that var(x) == std(x)^2
767767
assert_equal(var_x, std_x * std_x)
768768

@@ -796,7 +796,7 @@ def _non_null_values(x):
796796
cov_x_y = cov(x, y)
797797
cov_y_x = cov(y, x)
798798
assert_equal(cov_x_y, cov_y_x)
799-
799+
800800
# check that cov(x, y) == (var(x+y) - var(x) - var(y)) / 2
801801
var_x_plus_y = var(x + y)
802802
var_y = var(y)
@@ -1007,7 +1007,7 @@ def test_rolling_consistency(self):
10071007
expected.iloc[:, i, j] = rolling_f(x.iloc[:, i], x.iloc[:, j],
10081008
window=window, min_periods=min_periods, center=center)
10091009
assert_panel_equal(rolling_f_result, expected)
1010-
1010+
10111011
# binary moments
10121012
def test_rolling_cov(self):
10131013
A = self.series
@@ -1432,7 +1432,7 @@ def test_expanding_corr_pairwise_diff_length(self):
14321432
assert_frame_equal(result2, expected)
14331433
assert_frame_equal(result3, expected)
14341434
assert_frame_equal(result4, expected)
1435-
1435+
14361436
def test_pairwise_stats_column_names_order(self):
14371437
# GH 7738
14381438
df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),
@@ -1731,6 +1731,13 @@ def test_rolling_median_how_resample(self):
17311731
x = mom.rolling_median(series, window=1, freq='D')
17321732
assert_series_equal(expected, x)
17331733

1734+
def test_rolling_var_exactly_zero(self):
1735+
# Related to GH 7900
1736+
a = np.array([1, 2, 3, 3, 3, 2, 2, 2, np.nan, 2, np.nan, 2, np.nan,
1737+
np.nan, np.nan, 1, 1, 1])
1738+
v = mom.rolling_var(a, window=3, min_periods=2)
1739+
self.assertTrue(np.all(v[[4, 7, 8, 9, 11, 16, 17]] == 0))
1740+
17341741
if __name__ == '__main__':
17351742
import nose
17361743
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)