diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index d38b677df321c..16b7cbff44e03 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -36,7 +36,8 @@ def get_dispatch(dtypes):
 def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{c_type}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=1):
     """
     Only aggregates on axis=0
     """
@@ -88,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
     for i in range(ncounts):
         for j in range(K):
-            if nobs[i, j] == 0:
+            if nobs[i, j] < min_count:
                 out[i, j] = NAN
             else:
                 out[i, j] = sumx[i, j]
@@ -99,7 +100,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels):
+                        ndarray[int64_t] labels,
+                        Py_ssize_t min_count=1):
     """
     Only aggregates on axis=0
     """
@@ -147,7 +149,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 
     for i in range(ncounts):
         for j in range(K):
-            if nobs[i, j] == 0:
+            if nobs[i, j] < min_count:
                 out[i, j] = NAN
             else:
                 out[i, j] = prodx[i, j]
@@ -159,12 +161,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         {{dest_type2}} val, ct, oldmean
         ndarray[{{dest_type2}}, ndim=2] nobs, mean
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -208,12 +213,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{dest_type2}}, ndim=2] values,
-                        ndarray[int64_t] labels):
+                        ndarray[int64_t] labels,
+                        Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] sumx, nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -263,7 +271,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{dest_type2}}, ndim=2] values,
-                        ndarray[int64_t] labels):
+                        ndarray[int64_t] labels,
+                        Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -272,6 +281,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         {{dest_type2}} val, count
         Py_ssize_t ngroups = len(counts)
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if len(labels) == 0:
         return
 
@@ -332,7 +343,8 @@ def get_dispatch(dtypes):
 def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels):
+                        ndarray[int64_t] labels,
+                        Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -342,6 +354,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -382,7 +396,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{c_type}}, ndim=2] values,
-                       ndarray[int64_t] labels, int64_t rank):
+                       ndarray[int64_t] labels, int64_t rank,
+                       Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -392,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -455,7 +472,8 @@ def get_dispatch(dtypes):
 def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -464,6 +482,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] maxx, nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -526,7 +546,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{dest_type2}}, ndim=2] values,
-                       ndarray[int64_t] labels):
+                       ndarray[int64_t] labels,
+                       Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -535,6 +556,8 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] minx, nobs
 
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     if not len(values) == len(labels):
         raise AssertionError("len(index) != len(labels)")
 
@@ -686,7 +709,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_median_float64(ndarray[float64_t, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[float64_t, ndim=2] values,
-                         ndarray[int64_t] labels):
+                         ndarray[int64_t] labels,
+                         Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -695,6 +719,9 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
         ndarray[int64_t] _counts
         ndarray data
         float64_t* ptr
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
     ngroups = len(counts)
     N, K = (<object> values).shape
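All of the kernels above share one rule: after accumulating, a group's result is blanked out whenever its count of valid observations falls short of ``min_count`` (and the ops that don't support the keyword assert it stays -1). A plain-NumPy sketch of that rule, simplified to one dimension rather than the 2-D blocks the Cython templates operate on:

import numpy as np

def group_add_sketch(values, labels, ngroups, min_count=1):
    # Accumulate per-group sums and counts of non-NA observations, then
    # emit NaN for any group where nobs < min_count: the same predicate
    # as `if nobs[i, j] < min_count` in group_add/group_prod above.
    values = np.asarray(values, dtype='float64')
    sumx = np.zeros(ngroups)
    nobs = np.zeros(ngroups, dtype='int64')
    for val, lab in zip(values, labels):
        if lab < 0:  # a -1 label marks rows that belong to no group
            continue
        if not np.isnan(val):
            nobs[lab] += 1
            sumx[lab] += val
    return np.where(nobs < min_count, np.nan, sumx)

# group_add_sketch([1.0, 2.0, np.nan], [0, 0, 1], ngroups=3)
# -> array([ 3., nan, nan]): group 1 has no valid values, group 2 is empty.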
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index f2dbb3ef4d32a..2acf64f1d9f74 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7322,7 +7322,8 @@ def _add_numeric_operations(cls):
         @Substitution(outname='mad',
                       desc="Return the mean absolute deviation of the values "
                            "for the requested axis",
-                      name1=name, name2=name2, axis_descr=axis_descr)
+                      name1=name, name2=name2, axis_descr=axis_descr,
+                      min_count='', examples='')
         @Appender(_num_doc)
         def mad(self, axis=None, skipna=None, level=None):
             if skipna is None:
@@ -7363,7 +7364,8 @@ def mad(self, axis=None, skipna=None, level=None):
         @Substitution(outname='compounded',
                       desc="Return the compound percentage of the values for "
                            "the requested axis", name1=name, name2=name2,
-                      axis_descr=axis_descr)
+                      axis_descr=axis_descr,
+                      min_count='', examples='')
         @Appender(_num_doc)
         def compound(self, axis=None, skipna=None, level=None):
             if skipna is None:
@@ -7387,10 +7389,10 @@ def compound(self, axis=None, skipna=None, level=None):
             lambda y, axis: np.maximum.accumulate(y, axis), "max",
             -np.inf, np.nan)
 
-        cls.sum = _make_stat_function(
+        cls.sum = _make_min_count_stat_function(
             cls, 'sum', name, name2, axis_descr,
             'Return the sum of the values for the requested axis',
-            nanops.nansum)
+            nanops.nansum, _sum_examples)
         cls.mean = _make_stat_function(
             cls, 'mean', name, name2, axis_descr,
             'Return the mean of the values for the requested axis',
@@ -7406,10 +7408,10 @@ def compound(self, axis=None, skipna=None, level=None):
             "by N-1\n",
             nanops.nankurt)
         cls.kurtosis = cls.kurt
-        cls.prod = _make_stat_function(
+        cls.prod = _make_min_count_stat_function(
             cls, 'prod', name, name2, axis_descr,
             'Return the product of the values for the requested axis',
-            nanops.nanprod)
+            nanops.nanprod, _prod_examples)
         cls.product = cls.prod
         cls.median = _make_stat_function(
             cls, 'median', name, name2, axis_descr,
@@ -7540,10 +7542,13 @@ def _doc_parms(cls):
 numeric_only : boolean, default None
     Include only float, int, boolean columns. If None, will attempt to use
     everything, then use only numeric data. Not implemented for Series.
+%(min_count)s\
 
 Returns
 -------
-%(outname)s : %(name1)s or %(name2)s (if level specified)\n"""
+%(outname)s : %(name1)s or %(name2)s (if level specified)
+
+%(examples)s"""
 
 _num_ddof_doc = """
@@ -7611,9 +7616,92 @@ def _doc_parms(cls):
 """
 
 
+_sum_examples = """\
+Examples
+--------
+By default, the sum of an empty series is ``NaN``.
+
+>>> pd.Series([]).sum()  # min_count=1 is the default
+nan
+
+This can be controlled with the ``min_count`` parameter. For example, if
+you'd like the sum of an empty series to be 0, pass ``min_count=0``.
+
+>>> pd.Series([]).sum(min_count=0)
+0.0
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).sum()
+nan
+
+>>> pd.Series([np.nan]).sum(min_count=0)
+0.0
+"""
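On a DataFrame the keyword applies column by column. A quick sketch of the behaviour the docstring above describes, under this patch's default of ``min_count=1`` (values chosen purely for illustration):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [np.nan, np.nan]})

df.sum()             # a -> 3.0; b -> NaN: b has 0 valid values, and 0 < 1
df.sum(min_count=0)  # b -> 0.0, the empty-sum identity
df.sum(min_count=3)  # both NaN: only 2 rows, so no column can reach 3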
+
+_prod_examples = """\
+Examples
+--------
+By default, the product of an empty series is ``NaN``.
+
+>>> pd.Series([]).prod()
+nan
+
+This can be controlled with the ``min_count`` parameter.
+
+>>> pd.Series([]).prod(min_count=0)
+1.0
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).prod()
+nan
+
+>>> pd.Series([np.nan]).prod(min_count=0)
+1.0
+"""
+
+
+_min_count_stub = """\
+min_count : int, default 1
+    The required number of valid values to perform the operation. If fewer
+    than ``min_count`` non-NA values are present the result will be NA.
+
+    .. versionadded:: 0.21.2
+
+       Added with the default being 1. This means the sum or product
+       of an all-NA or empty series is ``NaN``.
+"""
+
+
+def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
+                                  f, examples):
+    @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+                  axis_descr=axis_descr, min_count=_min_count_stub,
+                  examples=examples)
+    @Appender(_num_doc)
+    def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
+                  min_count=1,
+                  **kwargs):
+        nv.validate_stat_func(tuple(), kwargs, fname=name)
+        if skipna is None:
+            skipna = True
+        if axis is None:
+            axis = self._stat_axis_number
+        if level is not None:
+            return self._agg_by_level(name, axis=axis, level=level,
+                                      skipna=skipna, min_count=min_count)
+        return self._reduce(f, name, axis=axis, skipna=skipna,
+                            numeric_only=numeric_only, min_count=min_count)
+
+    return set_function_name(stat_func, name, cls)
+
+
 def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
     @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
-                  axis_descr=axis_descr)
+                  axis_descr=axis_descr, min_count='', examples='')
     @Appender(_num_doc)
     def stat_func(self, axis=None, skipna=None, level=None,
                   numeric_only=None, **kwargs):
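``_make_min_count_stat_function`` is a decorator factory: one generic reducer is specialised per statistic, and ``min_count`` is forwarded both down the ``level=`` path (``_agg_by_level``) and the flat path (``_reduce``). A self-contained sketch of that pattern; the names here are illustrative, not the pandas internals:

import numpy as np

def make_min_count_stat(name, func):
    def stat(values, min_count=1):
        # Drop NAs, then refuse to reduce if too few values survive.
        valid = values[~np.isnan(values)]
        if valid.size < min_count:
            return np.nan
        return func(valid)
    stat.__name__ = name
    return stat

sketch_nansum = make_min_count_stat('sum', np.sum)
sketch_nanprod = make_min_count_stat('prod', np.prod)

assert np.isnan(sketch_nansum(np.array([np.nan])))  # default min_count=1
assert sketch_nansum(np.array([np.nan]), min_count=0) == 0.0
assert sketch_nanprod(np.array([], dtype=float), min_count=0) == 1.0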
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 47b80c00da4d4..041239ed06d88 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -986,7 +986,8 @@ def _cython_transform(self, how, numeric_only=True):
 
         return self._wrap_transformed_output(output, names)
 
-    def _cython_agg_general(self, how, alt=None, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True,
+                            min_count=-1):
         output = {}
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
@@ -994,7 +995,8 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True):
                 continue
 
             try:
-                result, names = self.grouper.aggregate(obj.values, how)
+                result, names = self.grouper.aggregate(obj.values, how,
+                                                       min_count=min_count)
             except AssertionError as e:
                 raise GroupByError(str(e))
             output[name] = self._try_cast(result, obj)
@@ -1301,7 +1303,8 @@ def _add_numeric_operations(cls):
         """ add numeric operations to the GroupBy generically """
 
         def groupby_function(name, alias, npfunc,
-                             numeric_only=True, _convert=False):
+                             numeric_only=True, _convert=False,
+                             min_count=-1):
 
             _local_template = "Compute %(f)s of group values"
 
@@ -1311,6 +1314,8 @@ def groupby_function(name, alias, npfunc,
             def f(self, **kwargs):
                 if 'numeric_only' not in kwargs:
                     kwargs['numeric_only'] = numeric_only
+                if 'min_count' not in kwargs:
+                    kwargs['min_count'] = min_count
                 self._set_group_selection()
                 try:
                     return self._cython_agg_general(
@@ -1358,8 +1363,8 @@ def last(x):
             else:
                 return last(x)
 
-        cls.sum = groupby_function('sum', 'add', np.sum)
-        cls.prod = groupby_function('prod', 'prod', np.prod)
+        cls.sum = groupby_function('sum', 'add', np.sum, min_count=1)
+        cls.prod = groupby_function('prod', 'prod', np.prod, min_count=1)
         cls.min = groupby_function('min', 'min', np.min, numeric_only=False)
         cls.max = groupby_function('max', 'max', np.max, numeric_only=False)
         cls.first = groupby_function('first', 'first', first_compat,
@@ -2139,7 +2144,7 @@ def get_group_levels(self):
         'var': 'group_var',
         'first': {
             'name': 'group_nth',
-            'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
+            'f': lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1)
         },
         'last': 'group_last',
         'ohlc': 'group_ohlc',
@@ -2209,7 +2214,7 @@ def wrapper(*args, **kwargs):
                         (how, dtype_str))
         return func, dtype_str
 
-    def _cython_operation(self, kind, values, how, axis):
+    def _cython_operation(self, kind, values, how, axis, min_count=-1):
         assert kind in ['transform', 'aggregate']
 
         # can we do this operation with our cython functions
@@ -2294,11 +2299,12 @@ def _cython_operation(self, kind, values, how, axis):
             counts = np.zeros(self.ngroups, dtype=np.int64)
             result = self._aggregate(
                 result, counts, values, labels, func, is_numeric,
-                is_datetimelike)
+                is_datetimelike, min_count)
         elif kind == 'transform':
             result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                  fill_value=np.nan)
 
+            # TODO: min_count
             result = self._transform(
                 result, values, labels, func, is_numeric, is_datetimelike)
 
@@ -2335,14 +2341,15 @@ def _cython_operation(self, kind, values, how, axis):
 
         return result, names
 
-    def aggregate(self, values, how, axis=0):
-        return self._cython_operation('aggregate', values, how, axis)
+    def aggregate(self, values, how, axis=0, min_count=-1):
+        return self._cython_operation('aggregate', values, how, axis,
+                                      min_count=min_count)
 
     def transform(self, values, how, axis=0):
         return self._cython_operation('transform', values, how, axis)
 
     def _aggregate(self, result, counts, values, comp_ids, agg_func,
-                   is_numeric, is_datetimelike):
+                   is_numeric, is_datetimelike, min_count=-1):
         if values.ndim > 3:
             # punting for now
             raise NotImplementedError("number of dimensions is currently "
@@ -2351,9 +2358,10 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
             for i, chunk in enumerate(values.transpose(2, 0, 1)):
 
                 chunk = chunk.squeeze()
-                agg_func(result[:, :, i], counts, chunk, comp_ids)
+                agg_func(result[:, :, i], counts, chunk, comp_ids,
+                         min_count)
         else:
-            agg_func(result, counts, values, comp_ids)
+            agg_func(result, counts, values, comp_ids, min_count)
 
         return result
 
@@ -3643,9 +3651,10 @@ def _iterate_slices(self):
                 continue
             yield val, slicer(val)
 
-    def _cython_agg_general(self, how, alt=None, numeric_only=True):
+    def _cython_agg_general(self, how, alt=None, numeric_only=True,
+                            min_count=-1):
         new_items, new_blocks = self._cython_agg_blocks(
-            how, alt=alt, numeric_only=numeric_only)
+            how, alt=alt, numeric_only=numeric_only, min_count=min_count)
         return self._wrap_agged_blocks(new_items, new_blocks)
 
     def _wrap_agged_blocks(self, items, blocks):
@@ -3671,7 +3680,8 @@ def _wrap_agged_blocks(self, items, blocks):
 
     _block_agg_axis = 0
 
-    def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
+    def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
+                           min_count=-1):
         # TODO: the actual managing of mgr_locs is a PITA
         # here, it should happen via BlockManager.combine
 
@@ -3688,7 +3698,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
             locs = block.mgr_locs.as_array
             try:
                 result, _ = self.grouper.aggregate(
-                    block.values, how, axis=agg_axis)
+                    block.values, how, axis=agg_axis, min_count=min_count)
             except NotImplementedError:
                 # generally if we have numeric_only=False
                 # and non-applicable functions
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index e1c09947ac0b4..88f69f6ff2e14 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -107,21 +107,9 @@ def f(values, axis=None, skipna=True, **kwds):
                 if k not in kwds:
                     kwds[k] = v
             try:
-                if values.size == 0:
-
-                    # we either return np.nan or pd.NaT
-                    if is_numeric_dtype(values):
-                        values = values.astype('float64')
-                    fill_value = na_value_for_dtype(values.dtype)
-
-                    if values.ndim == 1:
-                        return fill_value
-                    else:
-                        result_shape = (values.shape[:axis] +
-                                        values.shape[axis + 1:])
-                        result = np.empty(result_shape, dtype=values.dtype)
-                        result.fill(fill_value)
-                        return result
+                if values.size == 0 and kwds.get('min_count') is None:
+                    # We are empty, returning NA for our type
+                    return _na_for_min_count(values, axis)
 
                 if (_USE_BOTTLENECK and skipna and
                         _bn_ok_dtype(values.dtype, bn_name)):
@@ -292,6 +280,22 @@ def _wrap_results(result, dtype):
     return result
 
 
+def _na_for_min_count(values, axis):
+    # we either return np.nan or pd.NaT
+    if is_numeric_dtype(values):
+        values = values.astype('float64')
+    fill_value = na_value_for_dtype(values.dtype)
+
+    if values.ndim == 1:
+        return fill_value
+    else:
+        result_shape = (values.shape[:axis] +
+                        values.shape[axis + 1:])
+        result = np.empty(result_shape, dtype=values.dtype)
+        result.fill(fill_value)
+        return result
+
+
 def nanany(values, axis=None, skipna=True):
     values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna)
     return values.any(axis)
@@ -304,7 +308,7 @@ def nanall(values, axis=None, skipna=True):
 
 @disallow('M8')
 @bottleneck_switch()
-def nansum(values, axis=None, skipna=True):
+def nansum(values, axis=None, skipna=True, min_count=1):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
@@ -312,7 +316,7 @@ def nansum(values, axis=None, skipna=True):
     elif is_timedelta64_dtype(dtype):
         dtype_sum = np.float64
     the_sum = values.sum(axis, dtype=dtype_sum)
-    the_sum = _maybe_null_out(the_sum, axis, mask)
+    the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)
 
     return _wrap_results(the_sum, dtype)
 
@@ -641,13 +645,13 @@ def nankurt(values, axis=None, skipna=True):
 
 @disallow('M8', 'm8')
-def nanprod(values, axis=None, skipna=True):
+def nanprod(values, axis=None, skipna=True, min_count=1):
     mask = isna(values)
     if skipna and not is_any_int_dtype(values):
         values = values.copy()
         values[mask] = 1
     result = values.prod(axis)
-    return _maybe_null_out(result, axis, mask)
+    return _maybe_null_out(result, axis, mask, min_count=min_count)
 
 
 def _maybe_arg_null_out(result, axis, mask, skipna):
@@ -683,9 +687,9 @@ def _get_counts(mask, axis, dtype=float):
     return np.array(count, dtype=dtype)
 
 
-def _maybe_null_out(result, axis, mask):
+def _maybe_null_out(result, axis, mask, min_count=1):
     if axis is not None and getattr(result, 'ndim', False):
-        null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
+        null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
         if np.any(null_mask):
             if is_numeric_dtype(result):
                 if np.iscomplexobj(result):
@@ -698,7 +702,7 @@ def _maybe_null_out(result, axis, mask):
             result[null_mask] = None
     elif result is not tslib.NaT:
         null_mask = mask.size - mask.sum()
-        if null_mask == 0:
+        if null_mask < min_count:
             result = np.nan
 
     return result
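The nanops change in a nutshell: the predicate in ``_maybe_null_out`` moves from "zero valid values" to "fewer than ``min_count`` valid values". A standalone re-implementation of the 1-D branch above, for reference:

import numpy as np

def maybe_null_out_1d(result, mask, min_count=1):
    # mask is True where the input was NA, so mask.size - mask.sum() is
    # the number of valid values: the `null_mask < min_count` test above.
    if mask.size - mask.sum() < min_count:
        return np.nan
    return result

vals = np.array([np.nan, np.nan])
mask = np.isnan(vals)
assert np.isnan(maybe_null_out_1d(0.0, mask, min_count=1))
assert maybe_null_out_1d(0.0, mask, min_count=0) == 0.0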
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index c2bf7cff746eb..a30c727ecb87c 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -625,9 +625,20 @@ def size(self):
 Resampler._deprecated_valids += dir(Resampler)
 
+
+# downsample methods that support min_count
+for method in ['sum', 'prod']:
+
+    def f(self, _method=method, min_count=1, *args, **kwargs):
+        nv.validate_resampler_func(_method, args, kwargs)
+        return self._downsample(_method, min_count=min_count)
+    f.__doc__ = getattr(GroupBy, method).__doc__
+    setattr(Resampler, method, f)
+
+
 # downsample methods
-for method in ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem',
-               'median', 'prod', 'ohlc']:
+for method in ['min', 'max', 'first', 'last', 'mean', 'sem',
+               'median', 'ohlc']:
 
     def f(self, _method=method, *args, **kwargs):
         nv.validate_resampler_func(_method, args, kwargs)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 17d711f937bf7..80e9acd0d2281 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -973,6 +973,37 @@ def test_sum_corner(self):
         assert len(axis0) == 0
         assert len(axis1) == 0
 
+    @pytest.mark.parametrize('method, unit', [
+        ('sum', 0),
+        ('prod', 1),
+    ])
+    def test_sum_prod_nanops(self, method, unit):
+        idx = ['a', 'b', 'c']
+        df = pd.DataFrame({"a": [unit, unit],
+                           "b": [unit, np.nan],
+                           "c": [np.nan, np.nan]})
+
+        result = getattr(df, method)(min_count=1)
+        expected = pd.Series([unit, unit, np.nan], index=idx)
+        tm.assert_series_equal(result, expected)
+
+        result = getattr(df, method)(min_count=0)
+        expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
+        tm.assert_series_equal(result, expected)
+
+        result = getattr(df.iloc[1:], method)(min_count=1)
+        expected = pd.Series([unit, np.nan, np.nan], index=idx)
+        tm.assert_series_equal(result, expected)
+
+        df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
+        result = getattr(df, method)(min_count=5)
+        expected = pd.Series([unit, unit], index=['A', 'B'], dtype='float64')
+        tm.assert_series_equal(result, expected)
+
+        result = getattr(df, method)(min_count=6)
+        expected = pd.Series([unit, np.nan], index=['A', 'B'])
+        tm.assert_series_equal(result, expected)
+
     def test_sum_object(self):
         values = self.frame.values.astype(int)
         frame = DataFrame(values, index=self.frame.index,
diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py
index 3d27df31cee6e..07ecc085098bf 100644
--- a/pandas/tests/groupby/test_aggregate.py
+++ b/pandas/tests/groupby/test_aggregate.py
@@ -809,26 +809,33 @@ def test__cython_agg_general(self):
                 exc.args += ('operation: %s' % op, )
                 raise
 
-    def test_cython_agg_empty_buckets(self):
-        ops = [('mean', np.mean),
-               ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
-               ('var', lambda x: np.var(x, ddof=1)),
-               ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
-               ('prod', np.prod),
-               ('min', np.min),
-               ('max', np.max), ]
-
+    @pytest.mark.parametrize('op, targop', [
+        ('mean', np.mean),
+        ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
+        ('var', lambda x: np.var(x, ddof=1)),
+        ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
+        ('prod', np.prod),
+        ('min', np.min),
+        ('max', np.max), ]
+    )
+    def test_cython_agg_empty_buckets(self, op, targop):
         df = pd.DataFrame([11, 12, 13])
         grps = range(0, 55, 5)
 
-        for op, targop in ops:
-            result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
-            expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
-            try:
-                tm.assert_frame_equal(result, expected)
-            except BaseException as exc:
-                exc.args += ('operation: %s' % op,)
-                raise
+        # We call _cython_agg_general directly rather than via the user API,
+        # which sets different values for min_count, so set it here.
+        if op in ('add', 'prod'):
+            min_count = 1
+        else:
+            min_count = -1
+        result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(
+            op, min_count=min_count)
+        expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
+        try:
+            tm.assert_frame_equal(result, expected)
+        except BaseException as exc:
+            exc.args += ('operation: %s' % op,)
+            raise
 
     def test_agg_over_numpy_arrays(self):
         # GH 3788
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index c73423921898d..5e3d2bb9cf091 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -662,3 +662,48 @@ def test_groupby_categorical_two_columns(self):
             "C3": [nan, nan, nan, nan, 10, 100, nan, nan, nan, nan, 200,
                    34]}, index=idx)
         tm.assert_frame_equal(res, exp)
+
+    def test_empty_sum(self):
+        # https://github.com/pandas-dev/pandas/issues/18678
+        df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+                                               categories=['a', 'b', 'c']),
+                           'B': [1, 2, 1]})
+        expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
+
+        # NA by default
+        result = df.groupby("A").B.sum()
+        expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = df.groupby("A").B.sum(min_count=0)
+        expected = pd.Series([3, 1, 0], expected_idx, name='B')
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = df.groupby("A").B.sum(min_count=1)
+        expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
+        tm.assert_series_equal(result, expected)
+
+    def test_empty_prod(self):
+        # https://github.com/pandas-dev/pandas/issues/18678
+        df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+                                               categories=['a', 'b', 'c']),
+                           'B': [1, 2, 1]})
+
+        expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
+
+        # NA by default
+        result = df.groupby("A").B.prod()
+        expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = df.groupby("A").B.prod(min_count=0)
+        expected = pd.Series([2, 1, 1], expected_idx, name='B')
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = df.groupby("A").B.prod(min_count=1)
+        expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
+        tm.assert_series_equal(result, expected)
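The ``unit`` values threaded through the frame tests above and the series and resample tests below are the identity elements of the two reductions; they are what NumPy itself returns for an empty reduction, and what ``min_count=0`` opts back into:

import numpy as np

assert np.sum([]) == 0.0   # additive identity
assert np.prod([]) == 1.0  # multiplicative identity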
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 0dae6aa96ced1..cd92edc927173 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -30,38 +30,122 @@ class TestSeriesAnalytics(TestData):
 
     @pytest.mark.parametrize("use_bottleneck", [True, False])
-    @pytest.mark.parametrize("method", ["sum", "prod"])
-    def test_empty(self, method, use_bottleneck):
-
+    @pytest.mark.parametrize("method, unit", [
+        ("sum", 0.0),
+        ("prod", 1.0)
+    ])
+    def test_empty(self, method, unit, use_bottleneck):
         with pd.option_context("use_bottleneck", use_bottleneck):
             # GH 9422
-            # treat all missing as NaN
+            # Entirely empty
             s = Series([])
+            # NA by default
             result = getattr(s, method)()
             assert isna(result)
 
+            # Explicit
+            result = getattr(s, method)(min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(min_count=1)
+            assert isna(result)
+
+            # Skipna, default
             result = getattr(s, method)(skipna=True)
             assert isna(result)
 
+            # Skipna, explicit
+            result = getattr(s, method)(skipna=True, min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(skipna=True, min_count=1)
+            assert isna(result)
+
+            # All-NA
             s = Series([np.nan])
+            # NA by default
             result = getattr(s, method)()
             assert isna(result)
 
+            # Explicit
+            result = getattr(s, method)(min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(min_count=1)
+            assert isna(result)
+
+            # Skipna, default
             result = getattr(s, method)(skipna=True)
             assert isna(result)
 
+            # Skipna, explicit
+            result = getattr(s, method)(skipna=True, min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(skipna=True, min_count=1)
+            assert isna(result)
+
+            # Mix of valid and NA
             s = Series([np.nan, 1])
+            # Default
             result = getattr(s, method)()
             assert result == 1.0
 
-            s = Series([np.nan, 1])
+            # Explicit
+            result = getattr(s, method)(min_count=0)
+            assert result == 1.0
+
+            result = getattr(s, method)(min_count=1)
+            assert result == 1.0
+
+            # Skipna
             result = getattr(s, method)(skipna=True)
             assert result == 1.0
 
+            result = getattr(s, method)(skipna=True, min_count=0)
+            assert result == 1.0
+
+            result = getattr(s, method)(skipna=True, min_count=1)
+            assert result == 1.0
+
             # GH #844 (changed in 9422)
             df = DataFrame(np.empty((10, 0)))
             assert (df.sum(1).isnull()).all()
 
+            s = pd.Series([1])
+            result = getattr(s, method)(min_count=2)
+            assert isna(result)
+
+            s = pd.Series([np.nan])
+            result = getattr(s, method)(min_count=2)
+            assert isna(result)
+
+            s = pd.Series([np.nan, 1])
+            result = getattr(s, method)(min_count=2)
+            assert isna(result)
+
+    @pytest.mark.parametrize('method, unit', [
+        ('sum', 0.0),
+        ('prod', 1.0),
+    ])
+    def test_empty_multi(self, method, unit):
+        s = pd.Series([1, np.nan, np.nan, np.nan],
+                      index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)]))
+        # NaN by default
+        result = getattr(s, method)(level=0)
+        expected = pd.Series([1, np.nan], index=['a', 'b'])
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = getattr(s, method)(level=0, min_count=0)
+        expected = pd.Series([1, unit], index=['a', 'b'])
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = getattr(s, method)(level=0, min_count=1)
+        expected = pd.Series([1, np.nan], index=['a', 'b'])
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize(
         "method", ['sum', 'mean', 'median', 'std', 'var'])
     def test_ops_consistency_on_empty(self, method):
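``test_empty_multi`` pins down the ``level=`` path: reducing over a MultiIndex level dispatches to a groupby internally, so ``min_count`` has to flow through ``_agg_by_level`` as well as the flat nanops path. Illustrative calls using the PR-era API:

import numpy as np
import pandas as pd

s = pd.Series([1, np.nan, np.nan, np.nan],
              index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)]))
s.sum(level=0)               # 'b' level is all-NA -> NaN under min_count=1
s.sum(level=0, min_count=0)  # 'b' -> 0.0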
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index 38f4b8be469a5..4a3c4eff9f8c3 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -4,6 +4,7 @@
 from datetime import datetime, timedelta
 from functools import partial
 from textwrap import dedent
+from operator import methodcaller
 
 import pytz
 import pytest
@@ -3382,6 +3383,34 @@ def test_aggregate_normal(self):
             assert_frame_equal(expected, dt_result)
         """
 
+    @pytest.mark.parametrize('method, unit', [
+        ('sum', 0),
+        ('prod', 1),
+    ])
+    def test_resample_entirely_nat_window(self, method, unit):
+        s = pd.Series([0] * 2 + [np.nan] * 2,
+                      index=pd.date_range('2017', periods=4))
+        # NaN by default
+        result = methodcaller(method)(s.resample("2d"))
+        expected = pd.Series([0.0, np.nan],
+                             index=pd.to_datetime(['2017-01-01',
+                                                   '2017-01-03']))
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = methodcaller(method, min_count=0)(s.resample("2d"))
+        expected = pd.Series([0.0, unit],
+                             index=pd.to_datetime(['2017-01-01',
+                                                   '2017-01-03']))
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = methodcaller(method, min_count=1)(s.resample("2d"))
+        expected = pd.Series([0.0, np.nan],
+                             index=pd.to_datetime(['2017-01-01',
+                                                   '2017-01-03']))
+        tm.assert_series_equal(result, expected)
+
     def test_aggregate_with_nat(self):
         # check TimeGrouper's aggregation is identical as normal groupby
 
@@ -3441,3 +3470,29 @@ def test_repr(self):
                     "closed='left', label='left', how='mean', "
                     "convention='e', base=0)")
         assert result == expected
+
+    @pytest.mark.parametrize('method, unit', [
+        ('sum', 0),
+        ('prod', 1),
+    ])
+    def test_upsample_sum(self, method, unit):
+        s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
+        resampled = s.resample("30T")
+        index = pd.to_datetime(['2017-01-01T00:00:00',
+                                '2017-01-01T00:30:00',
+                                '2017-01-01T01:00:00'])
+
+        # NaN by default
+        result = methodcaller(method)(resampled)
+        expected = pd.Series([1, np.nan, 1], index=index)
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = methodcaller(method, min_count=0)(resampled)
+        expected = pd.Series([1, unit, 1], index=index)
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = methodcaller(method, min_count=1)(resampled)
+        expected = pd.Series([1, np.nan, 1], index=index)
+        tm.assert_series_equal(result, expected)
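Upsampling is the resample case that manufactures empty bins (the 00:30 slot below contains no source points), which is why these tests assert NaN there by default and the identity element under ``min_count=0``:

import numpy as np
import pandas as pd

s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
s.resample("30T").sum()             # [1.0, NaN, 1.0] under min_count=1
s.resample("30T").sum(min_count=0)  # [1.0, 0.0, 1.0]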