Skip to content

ENH: Added a min_count keyword to stat funcs #18876

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 39 additions & 12 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def get_dispatch(dtypes):
def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=1):
"""
Only aggregates on axis=0
"""
Expand Down Expand Up @@ -88,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work for `min_count == 0`?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, sumx starts out as zeros, so we just have to avoid setting it to NaN. Same for prod, but with ones.

out[i, j] = NAN
else:
out[i, j] = sumx[i, j]
Expand All @@ -99,7 +100,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=1):
"""
Only aggregates on axis=0
"""
Expand Down Expand Up @@ -147,7 +149,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count:
out[i, j] = NAN
else:
out[i, j] = prodx[i, j]
Expand All @@ -159,12 +161,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
{{dest_type2}} val, ct, oldmean
ndarray[{{dest_type2}}, ndim=2] nobs, mean

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

Expand Down Expand Up @@ -208,12 +213,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
{{dest_type2}} val, count
ndarray[{{dest_type2}}, ndim=2] sumx, nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

Expand Down Expand Up @@ -263,7 +271,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
Expand All @@ -272,6 +281,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
{{dest_type2}} val, count
Py_ssize_t ngroups = len(counts)

assert min_count == -1, "'min_count' only used in add and prod"

if len(labels) == 0:
return

Expand Down Expand Up @@ -332,7 +343,8 @@ def get_dispatch(dtypes):
def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
Expand All @@ -342,6 +354,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[{{dest_type2}}, ndim=2] resx
ndarray[int64_t, ndim=2] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

Expand Down Expand Up @@ -382,7 +396,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{c_type}}, ndim=2] values,
ndarray[int64_t] labels, int64_t rank):
ndarray[int64_t] labels, int64_t rank,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
Expand All @@ -392,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[{{dest_type2}}, ndim=2] resx
ndarray[int64_t, ndim=2] nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

Expand Down Expand Up @@ -455,7 +472,8 @@ def get_dispatch(dtypes):
def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
Expand All @@ -464,6 +482,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
{{dest_type2}} val, count
ndarray[{{dest_type2}}, ndim=2] maxx, nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

Expand Down Expand Up @@ -526,7 +546,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
ndarray[int64_t] counts,
ndarray[{{dest_type2}}, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
Expand All @@ -535,6 +556,8 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
{{dest_type2}} val, count
ndarray[{{dest_type2}}, ndim=2] minx, nobs

assert min_count == -1, "'min_count' only used in add and prod"

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

Expand Down Expand Up @@ -686,7 +709,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
def group_median_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int64_t] labels):
ndarray[int64_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
Expand All @@ -695,6 +719,9 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] _counts
ndarray data
float64_t* ptr

assert min_count == -1, "'min_count' only used in add and prod"

ngroups = len(counts)
N, K = (<object> values).shape

Expand Down
104 changes: 96 additions & 8 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7322,7 +7322,8 @@ def _add_numeric_operations(cls):
@Substitution(outname='mad',
desc="Return the mean absolute deviation of the values "
"for the requested axis",
name1=name, name2=name2, axis_descr=axis_descr)
name1=name, name2=name2, axis_descr=axis_descr,
min_count='', examples='')
@Appender(_num_doc)
def mad(self, axis=None, skipna=None, level=None):
if skipna is None:
Expand Down Expand Up @@ -7363,7 +7364,8 @@ def mad(self, axis=None, skipna=None, level=None):
@Substitution(outname='compounded',
desc="Return the compound percentage of the values for "
"the requested axis", name1=name, name2=name2,
axis_descr=axis_descr)
axis_descr=axis_descr,
min_count='', examples='')
@Appender(_num_doc)
def compound(self, axis=None, skipna=None, level=None):
if skipna is None:
Expand All @@ -7387,10 +7389,10 @@ def compound(self, axis=None, skipna=None, level=None):
lambda y, axis: np.maximum.accumulate(y, axis), "max",
-np.inf, np.nan)

cls.sum = _make_stat_function(
cls.sum = _make_min_count_stat_function(
cls, 'sum', name, name2, axis_descr,
'Return the sum of the values for the requested axis',
nanops.nansum)
nanops.nansum, _sum_examples)
cls.mean = _make_stat_function(
cls, 'mean', name, name2, axis_descr,
'Return the mean of the values for the requested axis',
Expand All @@ -7406,10 +7408,10 @@ def compound(self, axis=None, skipna=None, level=None):
"by N-1\n",
nanops.nankurt)
cls.kurtosis = cls.kurt
cls.prod = _make_stat_function(
cls.prod = _make_min_count_stat_function(
cls, 'prod', name, name2, axis_descr,
'Return the product of the values for the requested axis',
nanops.nanprod)
nanops.nanprod, _prod_examples)
cls.product = cls.prod
cls.median = _make_stat_function(
cls, 'median', name, name2, axis_descr,
Expand Down Expand Up @@ -7540,10 +7542,13 @@ def _doc_parms(cls):
numeric_only : boolean, default None
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data. Not implemented for Series.
%(min_count)s\

Returns
-------
%(outname)s : %(name1)s or %(name2)s (if level specified)\n"""
%(outname)s : %(name1)s or %(name2)s (if level specified)

%(examples)s"""

_num_ddof_doc = """

Expand Down Expand Up @@ -7611,9 +7616,92 @@ def _doc_parms(cls):
"""


# "Examples" section for the ``sum`` docstring. It is passed as the
# ``examples`` argument of ``_make_min_count_stat_function`` and fills the
# ``%(examples)s`` placeholder of the shared numeric docstring template.
_sum_examples = """\
Examples
--------
By default, the sum of an empty series is ``NaN``.

>>> pd.Series([]).sum() # min_count=1 is the default
nan

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be 0, pass ``min_count=0``.

>>> pd.Series([]).sum(min_count=0)
0.0

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
nan

>>> pd.Series([np.nan]).sum(min_count=0)
0.0
"""

_prod_examples = """\
Examples
--------
By default, the product of an empty series is ``NaN``

>>> pd.Series([]).prod()
nan

This can be controlled with the ``min_count`` parameter

>>> pd.Series([]).prod(min_count=0)
1.0

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
nan

>>> pd.Series([np.nan]).sum(min_count=0)
1.0
"""


_min_count_stub = """\
min_count : int, default 1
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.

.. versionadded :: 0.21.2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will probably become 0.22 (but that can be changed later).


Added with the default being 1. This means the sum or product
of an all-NA or empty series is ``NaN``.
"""


def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
                                  f, examples):
    """
    Build a reduction method that accepts a ``min_count`` keyword.

    Parameters
    ----------
    cls : type
        Class the generated method is named for; forwarded to
        ``set_function_name``.
    name : str
        Name of the reduction. Used as the docstring ``outname``, as the
        ``fname`` for keyword validation, and as the operation name passed
        to ``_agg_by_level`` / ``_reduce``.
    name1, name2, axis_descr, desc
        Values substituted into the shared ``_num_doc`` template.
    f : callable
        Reduction function handed to ``self._reduce``.
    examples : str
        Text substituted for the ``%(examples)s`` placeholder.

    Returns
    -------
    The result of ``set_function_name(stat_func, name, cls)``.
    """
    # NOTE: the inner function deliberately has no docstring of its own;
    # its documentation is produced by the Appender/Substitution decorators.
    @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
                  axis_descr=axis_descr, min_count=_min_count_stub,
                  examples=examples)
    @Appender(_num_doc)
    def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
                  min_count=1,
                  **kwargs):
        # Validate extra keywords first; no positional extras are accepted.
        nv.validate_stat_func((), kwargs, fname=name)

        # Resolve the "not given" defaults.
        skipna = True if skipna is None else skipna
        axis = self._stat_axis_number if axis is None else axis

        if level is None:
            return self._reduce(f, name, axis=axis, skipna=skipna,
                                numeric_only=numeric_only,
                                min_count=min_count)

        # A level was requested: aggregate per level instead of reducing.
        return self._agg_by_level(name, axis=axis, level=level,
                                  skipna=skipna, min_count=min_count)

    return set_function_name(stat_func, name, cls)


def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
@Substitution(outname=name, desc=desc, name1=name1, name2=name2,
axis_descr=axis_descr)
axis_descr=axis_descr, min_count='', examples='')
@Appender(_num_doc)
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
Expand Down
Loading