pandas-dev · TomAugspurger · Dec 28, 2017 · Dec 20, 2017 · jreback · Dec 21, 2017
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -36,7 +36,8 @@ def get_dispatch(dtypes):
 def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{c_type}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=1):
     """
     Only aggregates on axis=0
     """
@@ -88,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     out[i, j] = NAN
                 else:
                     out[i, j] = sumx[i, j]
@@ -99,7 +100,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels):
+                        ndarray[int64_t] labels,
+                        Py_ssize_t min_count=1):
     """
     Only aggregates on axis=0
     """
@@ -147,7 +149,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     out[i, j] = NAN
                 else:
                     out[i, j] = prodx[i, j]
@@ -159,12 +161,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         {{dest_type2}} val, ct, oldmean
         ndarray[{{dest_type2}}, ndim=2] nobs, mean
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -208,12 +213,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{dest_type2}}, ndim=2] values,
-                        ndarray[int64_t] labels):
+                        ndarray[int64_t] labels,
+                        Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] sumx, nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -263,7 +271,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                   ndarray[int64_t] counts,
                   ndarray[{{dest_type2}}, ndim=2] values,
-                  ndarray[int64_t] labels):
+                  ndarray[int64_t] labels,
+                  Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -272,6 +281,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         {{dest_type2}} val, count
         Py_ssize_t ngroups = len(counts)
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if len(labels) == 0:
         return
 
@@ -332,7 +343,8 @@ def get_dispatch(dtypes):
 def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels):
+                        ndarray[int64_t] labels,
+                        Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -342,6 +354,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -382,7 +396,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{c_type}}, ndim=2] values,
-                       ndarray[int64_t] labels, int64_t rank):
+                       ndarray[int64_t] labels, int64_t rank,
+                       Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -392,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -455,7 +472,8 @@ def get_dispatch(dtypes):
 def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -464,6 +482,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] maxx, nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -526,7 +546,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -535,6 +556,8 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] minx, nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -686,7 +709,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_median_float64(ndarray[float64_t, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[float64_t, ndim=2] values,
-                         ndarray[int64_t] labels):
+                         ndarray[int64_t] labels,
+                         Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -695,6 +719,9 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
         ndarray[int64_t] _counts
         ndarray data
         float64_t* ptr
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     ngroups = len(counts)
     N, K = (<object> values).shape
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -7322,7 +7322,8 @@ def _add_numeric_operations(cls):
         @Substitution(outname='mad',
                       desc="Return the mean absolute deviation of the values "
                            "for the requested axis",
-                      name1=name, name2=name2, axis_descr=axis_descr)
+                      name1=name, name2=name2, axis_descr=axis_descr,
+                      min_count='', examples='')
         @Appender(_num_doc)
         def mad(self, axis=None, skipna=None, level=None):
             if skipna is None:
@@ -7363,7 +7364,8 @@ def mad(self, axis=None, skipna=None, level=None):
         @Substitution(outname='compounded',
                       desc="Return the compound percentage of the values for "
                       "the requested axis", name1=name, name2=name2,
-                      axis_descr=axis_descr)
+                      axis_descr=axis_descr,
+                      min_count='', examples='')
         @Appender(_num_doc)
         def compound(self, axis=None, skipna=None, level=None):
             if skipna is None:
@@ -7387,10 +7389,10 @@ def compound(self, axis=None, skipna=None, level=None):
             lambda y, axis: np.maximum.accumulate(y, axis), "max",
             -np.inf, np.nan)
 
-        cls.sum = _make_stat_function(
+        cls.sum = _make_min_count_stat_function(
             cls, 'sum', name, name2, axis_descr,
             'Return the sum of the values for the requested axis',
-            nanops.nansum)
+            nanops.nansum, _sum_examples)
         cls.mean = _make_stat_function(
             cls, 'mean', name, name2, axis_descr,
             'Return the mean of the values for the requested axis',
@@ -7406,10 +7408,10 @@ def compound(self, axis=None, skipna=None, level=None):
             "by N-1\n",
             nanops.nankurt)
         cls.kurtosis = cls.kurt
-        cls.prod = _make_stat_function(
+        cls.prod = _make_min_count_stat_function(
             cls, 'prod', name, name2, axis_descr,
             'Return the product of the values for the requested axis',
-            nanops.nanprod)
+            nanops.nanprod, _prod_examples)
         cls.product = cls.prod
         cls.median = _make_stat_function(
             cls, 'median', name, name2, axis_descr,
@@ -7540,10 +7542,13 @@ def _doc_parms(cls):
 numeric_only : boolean, default None
     Include only float, int, boolean columns. If None, will attempt to use
     everything, then use only numeric data. Not implemented for Series.
+%(min_count)s\
 
 Returns
 -------
-%(outname)s : %(name1)s or %(name2)s (if level specified)\n"""
+%(outname)s : %(name1)s or %(name2)s (if level specified)
+
+%(examples)s"""
 
 _num_ddof_doc = """
 
@@ -7611,9 +7616,92 @@ def _doc_parms(cls):
 """
 
 
+_sum_examples = """\
+Examples
+--------
+By default, the sum of an empty series is ``NaN``.
+
+>>> pd.Series([]).sum()  # min_count=1 is the default
+nan
+
+This can be controlled with the ``min_count`` parameter. For example, if
+you'd like the sum of an empty series to be 0, pass ``min_count=0``.
+
+>>> pd.Series([]).sum(min_count=0)
+0.0
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).sum()
+nan
+
+>>> pd.Series([np.nan]).sum(min_count=0)
+0.0
+"""
+
+_prod_examples = """\
+Examples
+--------
+By default, the product of an empty series is ``NaN``.
+
+>>> pd.Series([]).prod()
+nan
+
+This can be controlled with the ``min_count`` parameter.
+
+>>> pd.Series([]).prod(min_count=0)
+1.0
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).prod()
+nan
+
+>>> pd.Series([np.nan]).prod(min_count=0)
+1.0
+"""
+
+
+_min_count_stub = """\
+min_count : int, default 1
+    The required number of valid values to perform the operation. If fewer than
+    ``min_count`` non-NA values are present, the result will be NA.
+
+    .. versionadded :: 0.21.2
+
+       Added with the default being 1. This means the sum or product
+       of an all-NA or empty series is ``NaN``.
+"""
+
+
+def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
+                                  f, examples):
+    @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+                  axis_descr=axis_descr, min_count=_min_count_stub,
+                  examples=examples)
+    @Appender(_num_doc)
+    def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
+                  min_count=1,
+                  **kwargs):
+        nv.validate_stat_func(tuple(), kwargs, fname=name)
+        if skipna is None:
+            skipna = True
+        if axis is None:
+            axis = self._stat_axis_number
+        if level is not None:
+            return self._agg_by_level(name, axis=axis, level=level,
+                                      skipna=skipna, min_count=min_count)
+        return self._reduce(f, name, axis=axis, skipna=skipna,
+                            numeric_only=numeric_only, min_count=min_count)
+
+    return set_function_name(stat_func, name, cls)
+
+
 def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
     @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
-                  axis_descr=axis_descr)
+                  axis_descr=axis_descr, min_count='', examples='')
     @Appender(_num_doc)
     def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
                   **kwargs):