Skip to content

ENH: numerically stable rolling_skew and rolling_kurt #8424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 83 additions & 122 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1331,144 +1331,105 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):

return output

#----------------------------------------------------------------------
# Rolling skewness and kurtosis

#-------------------------------------------------------------------------------
# Rolling skewness
@cython.boundscheck(False)
@cython.wraparound(False)
def roll_skew(ndarray[double_t] input, int win, int minp):
def roll_higher_moment(ndarray[double_t] input, int win, int minp, bint kurt):
"""
Numerically stable implementation of skewness and kurtosis using a
Welford-like method. If `kurt` is True, rolling kurtosis is computed,
if False, rolling skewness.
"""
cdef double val, prev
cdef double x = 0, xx = 0, xxx = 0
cdef Py_ssize_t nobs = 0, i
cdef Py_ssize_t N = len(input)
cdef double mean_x = 0, s2dm_x = 0, s3dm_x = 0, s4dm_x = 0, rep = NaN
cdef double delta, delta_n, tmp
cdef Py_ssize_t i, nobs = 0, nrep = 0, N = len(input)

cdef ndarray[double_t] output = np.empty(N, dtype=float)

# 3 components of the skewness equation
cdef double A, B, C, R

minp = _check_minp(win, minp, N)
with nogil:
for i from 0 <= i < minp - 1:
val = input[i]
minobs = max(minp, 4 if kurt else 3)

# Not NaN
if val == val:
nobs += 1
x += val
xx += val * val
xxx += val * val * val

output[i] = NaN

for i from minp - 1 <= i < N:
val = input[i]

if val == val:
nobs += 1
x += val
xx += val * val
xxx += val * val * val

if i > win - 1:
prev = input[i - win]
if prev == prev:
x -= prev
xx -= prev * prev
xxx -= prev * prev * prev

nobs -= 1
if nobs >= minp:
A = x / nobs
B = xx / nobs - A * A
C = xxx / nobs - A * A * A - 3 * A * B
if B <= 0 or nobs < 3:
output[i] = NaN
for i from 0 <= i < N:
val = input[i]
prev = NaN if i < win else input[i - win]

if prev == prev:
# prev is not NaN, remove an observation...
nobs -= 1
if nobs < nrep:
# ...all non-NaN values were identical, remove a repeat
nrep -= 1
if nobs == nrep:
# We can get here both if all non-NaN were already identical
# or if nobs == 1 after removing the observation
if nrep == 0:
rep = NaN
mean_x = 0
else:
R = sqrt(B)
output[i] = ((sqrt(nobs * (nobs - 1.)) * C) /
((nobs-2) * R * R * R))
mean_x = rep
# This is redundant most of the time
s2dm_x = s3dm_x = s4dm_x = 0
else:
output[i] = NaN

return output

#-------------------------------------------------------------------------------
# Rolling kurtosis
@cython.boundscheck(False)
@cython.wraparound(False)
def roll_kurt(ndarray[double_t] input,
int win, int minp):
cdef double val, prev
cdef double x = 0, xx = 0, xxx = 0, xxxx = 0
cdef Py_ssize_t nobs = 0, i
cdef Py_ssize_t N = len(input)

cdef ndarray[double_t] output = np.empty(N, dtype=float)

# 5 components of the kurtosis equation
cdef double A, B, C, D, R, K

minp = _check_minp(win, minp, N)
with nogil:
for i from 0 <= i < minp - 1:
val = input[i]

# Not NaN
if val == val:
nobs += 1

# seriously don't ask me why this is faster
x += val
xx += val * val
xxx += val * val * val
xxxx += val * val * val * val

output[i] = NaN

for i from minp - 1 <= i < N:
val = input[i]

if val == val:
nobs += 1
x += val
xx += val * val
xxx += val * val * val
xxxx += val * val * val * val

if i > win - 1:
prev = input[i - win]
if prev == prev:
x -= prev
xx -= prev * prev
xxx -= prev * prev * prev
xxxx -= prev * prev * prev * prev

nobs -= 1

if nobs >= minp:
A = x / nobs
R = A * A
B = xx / nobs - R
R = R * A
C = xxx / nobs - R - 3 * A * B
R = R * A
D = xxxx / nobs - R - 6*B*A*A - 4*C*A

if B == 0 or nobs < 4:
output[i] = NaN

else:
K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2)
K = K / ((nobs - 2.)*(nobs-3.))

output[i] = K
# ...update mean and sums of raised differences from mean
delta = prev - mean_x
delta_n = delta / nobs
tmp = delta * delta_n * (nobs + 1)
if kurt:
s4dm_x -= ((tmp * ((nobs + 3) * nobs + 3) -
6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n
s3dm_x -= (tmp * (nobs + 2) - 3 * s2dm_x) * delta_n
s2dm_x -= tmp
mean_x -= delta_n

if val == val:
# val is not NaN, adding an observation...
nobs += 1
if val == rep:
# ...and adding a repeat
nrep += 1
else:
output[i] = NaN
# ...and resetting repeats
nrep = 1
rep = val
if nobs == nrep:
# ...all non-NaN values are identical
mean_x = rep
s2dm_x = s3dm_x = s4dm_x = 0
else:
# ...update mean and sums of raised differences from mean
delta = val - mean_x
delta_n = delta / nobs
tmp = delta * delta_n * (nobs - 1)
if kurt:
s4dm_x += ((tmp * ((nobs - 3) * nobs + 3) +
6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n
s3dm_x += (tmp * (nobs - 2) - 3 * s2dm_x) * delta_n
s2dm_x += tmp
mean_x += delta_n

# Sums of even powers must be positive
if s2dm_x < 0 or s4dm_x < 0:
s2dm_x = s3dm_x = s4_dm_x = 0

if nobs < minobs or s2dm_x == 0:
output[i] = NaN
elif kurt:
# multiplications are cheap, divisions are not
tmp = s2dm_x * s2dm_x
output[i] = (nobs - 1) * (nobs * (nobs + 1) * s4dm_x -
3 * (nobs - 1) * tmp)
output[i] /= tmp * (nobs - 2) * (nobs - 3)
else:
# multiplications are cheap, divisions and square roots are not
tmp = (nobs - 2) * (nobs - 2) * s2dm_x * s2dm_x * s2dm_x
output[i] = s3dm_x * nobs * sqrt((nobs - 1) / tmp)

return output


#-------------------------------------------------------------------------------
# Rolling median, min, max

Expand Down
52 changes: 38 additions & 14 deletions pandas/stats/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,8 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None,


def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
how=None, args=(), kwargs={}, **kwds):
how=None, args=(), kwargs={}, center_data=False,
norm_data=False, **kwds):
"""
Rolling statistical measure using supplied function. Designed to be
used with passed-in Cython array-based functions.
Expand All @@ -378,15 +379,21 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
Passed on to func
kwargs : dict
Passed on to func
center_data : bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

put the defaults here (after bool),e .g. center_data : bool, default False

If True, subtract the mean of the data from the values
norm_data: bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a versionadded directiver here

If True, subtract the mean of the data from the values, and divide
by their standard deviation.

Returns
-------
y : type of input
"""
arg = _conv_timerule(arg, freq, how)

return_hook, values = _process_data_structure(arg)

return_hook, values = _process_data_structure(arg,
center_data=center_data,
norm_data=norm_data)
if values.size == 0:
result = values.copy()
else:
Expand Down Expand Up @@ -423,7 +430,8 @@ def _center_window(rs, window, axis):
return rs


def _process_data_structure(arg, kill_inf=True):
def _process_data_structure(arg, kill_inf=True, center_data=False,
norm_data=False):
if isinstance(arg, DataFrame):
return_hook = lambda v: type(arg)(v, index=arg.index,
columns=arg.columns)
Expand All @@ -438,9 +446,15 @@ def _process_data_structure(arg, kill_inf=True):
if not issubclass(values.dtype.type, float):
values = values.astype(float)

if kill_inf:
if kill_inf or center_data or norm_data:
values = values.copy()
values[np.isinf(values)] = np.NaN
mask = np.isfinite(values)
if kill_inf:
values[~mask] = np.NaN
if center_data or norm_data:
values -= np.mean(values[mask])
if norm_data:
values /= np.std(values[mask])

return return_hook, values

Expand Down Expand Up @@ -629,7 +643,8 @@ def _use_window(minp, window):
return minp


def _rolling_func(func, desc, check_minp=_use_window, how=None, additional_kw=''):
def _rolling_func(func, desc, check_minp=_use_window, how=None,
additional_kw='', center_data=False, norm_data=False):
if how is None:
how_arg_str = 'None'
else:
Expand All @@ -645,7 +660,8 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
minp = check_minp(minp, window)
return func(arg, window, minp, **kwds)
return _rolling_moment(arg, window, call_cython, min_periods, freq=freq,
center=center, how=how, **kwargs)
center=center, how=how, center_data=center_data,
norm_data=norm_data, **kwargs)

return f

Expand All @@ -657,16 +673,24 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
how='median')

_ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw))
def _roll_skew(*args, **kwargs):
kwargs['kurt'] = False
return algos.roll_higher_moment(*args, **kwargs)
def _roll_kurt(*args, **kwargs):
kwargs['kurt'] = True
return algos.roll_higher_moment(*args, **kwargs)
rolling_std = _rolling_func(_ts_std, 'Moving standard deviation.',
check_minp=_require_min_periods(1),
additional_kw=_ddof_kw)
rolling_var = _rolling_func(algos.roll_var, 'Moving variance.',
check_minp=_require_min_periods(1),
additional_kw=_ddof_kw)
rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.',
check_minp=_require_min_periods(3))
rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.',
check_minp=_require_min_periods(4))
rolling_skew = _rolling_func(_roll_skew, 'Unbiased moving skewness.',
check_minp=_require_min_periods(3),
center_data=True, norm_data=False)
rolling_kurt = _rolling_func(_roll_kurt, 'Unbiased moving kurtosis.',
check_minp=_require_min_periods(4),
center_data=True, norm_data=True)


def rolling_quantile(arg, window, quantile, min_periods=None, freq=None,
Expand Down Expand Up @@ -903,9 +927,9 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
expanding_var = _expanding_func(algos.roll_var, 'Expanding variance.',
check_minp=_require_min_periods(1),
additional_kw=_ddof_kw)
expanding_skew = _expanding_func(algos.roll_skew, 'Unbiased expanding skewness.',
expanding_skew = _expanding_func(_roll_skew, 'Unbiased expanding skewness.',
check_minp=_require_min_periods(3))
expanding_kurt = _expanding_func(algos.roll_kurt, 'Unbiased expanding kurtosis.',
expanding_kurt = _expanding_func(_roll_kurt, 'Unbiased expanding kurtosis.',
check_minp=_require_min_periods(4))


Expand Down
Loading