
API: Series.sum() will now return 0.0 for all-NaN series #10815


Closed · wants to merge 1 commit
14 changes: 14 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
@@ -551,6 +551,20 @@ Other API Changes

- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
+
+- ``Series.sum()`` will now return 0.0, and ``Series.prod()`` will return 1.0 for all-NaN series rather than ``NaN``; this is for compat with ``numpy`` >= 1.8.2 and ``bottleneck`` >= 1.0 (:issue:`9422`).
+
+  .. ipython:: python
+
+     s = Series([np.nan])
+     s.sum()
+     s.sum(skipna=False)
+     s.prod()
+     s.prod(skipna=False)
+
+  .. warning::
+
+     ``bottleneck`` is used for these calculations. If you have ``bottleneck`` < 1.0, then these will all return ``NaN``.

.. _whatsnew_0170.deprecations:

Deprecations
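For reference, a minimal sketch of the behavior the whatsnew entry describes; the commented outputs are what the entry implies for a build with this patch (and `bottleneck` >= 1.0), not a recorded session:

```python
import numpy as np
import pandas as pd

s = pd.Series([np.nan])

s.sum()               # 0.0 under the new semantics (previously NaN)
s.prod()              # 1.0 under the new semantics (previously NaN)
s.sum(skipna=False)   # nan -- skipna=False still propagates NaN
s.prod(skipna=False)  # nan
```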
1 change: 1 addition & 0 deletions pandas/__init__.py
@@ -27,6 +27,7 @@
_np_version = np.version.short_version
_np_version_under1p8 = LooseVersion(_np_version) < '1.8'
_np_version_under1p9 = LooseVersion(_np_version) < '1.9'
+_np_version_under1p10 = LooseVersion(_np_version) < '1.10'


from pandas.info import __doc__
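The new flag follows the existing `_np_version_under1p8`/`_np_version_under1p9` pattern. A self-contained sketch of that version-gating idiom (using the `distutils` `LooseVersion` import that pandas relied on at the time):

```python
from distutils.version import LooseVersion

import numpy as np

# LooseVersion compares version components numerically, so '1.10' is
# correctly treated as newer than '1.9' (plain string comparison is not).
_np_version = np.version.short_version
_np_version_under1p10 = LooseVersion(_np_version) < '1.10'

if _np_version_under1p10:
    # e.g. np.nanprod only exists on numpy >= 1.10
    print("older numpy: gate version-dependent behavior here")
```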
30 changes: 19 additions & 11 deletions pandas/core/groupby.py
@@ -102,11 +102,11 @@ class SpecificationError(GroupByError):


def _groupby_function(name, alias, npfunc, numeric_only=True,
-                      _convert=False):
+                      fillna=None, _convert=False):
    def f(self):
        self._set_selection_from_grouper()
        try:
-            return self._cython_agg_general(alias, numeric_only=numeric_only)
+            return self._cython_agg_general(alias, numeric_only=numeric_only, fillna=fillna)
        except AssertionError as e:
            raise SpecificationError(str(e))
        except Exception:
@@ -793,8 +793,8 @@ def size(self):
        """
        return self.grouper.size()

-    sum = _groupby_function('sum', 'add', np.sum)
-    prod = _groupby_function('prod', 'prod', np.prod)
+    sum = _groupby_function('sum', 'add', np.sum, fillna=0.0)
+    prod = _groupby_function('prod', 'prod', np.prod, fillna=1.0)
    min = _groupby_function('min', 'min', np.min, numeric_only=False)
    max = _groupby_function('max', 'max', np.max, numeric_only=False)
    first = _groupby_function('first', 'first', _first_compat,
@@ -1118,15 +1118,15 @@ def _try_cast(self, result, obj):

        return result

-    def _cython_agg_general(self, how, numeric_only=True):
+    def _cython_agg_general(self, how, numeric_only=True, fillna=None):
        output = {}
        for name, obj in self._iterate_slices():
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            try:
-                result, names = self.grouper.aggregate(obj.values, how)
+                result, names = self.grouper.aggregate(obj.values, how, fillna=fillna)
            except AssertionError as e:
                raise GroupByError(str(e))
            output[name] = self._try_cast(result, obj)
@@ -1511,7 +1511,7 @@ def wrapper(*args, **kwargs):
                                      (how, dtype_str))
        return func, dtype_str

-    def aggregate(self, values, how, axis=0):
+    def aggregate(self, values, how, axis=0, fillna=None):
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
@@ -1534,14 +1534,18 @@ def aggregate(self, values, how, axis=0):
            values = values.view('int64')
            # GH 7754
            is_numeric = True
+            fillna = None
        elif is_bool_dtype(values.dtype):
            values = _algos.ensure_float64(values)
+            fillna = None
        elif com.is_integer_dtype(values):
            values = values.astype('int64', copy=False)
+            fillna = None
        elif is_numeric:
            values = _algos.ensure_float64(values)
        else:
            values = values.astype(object)
+            fillna = None

        try:
            agg_func, dtype_str = self._get_aggregate_function(how, values)
@@ -1564,6 +1568,10 @@

        result = self._aggregate(result, counts, values, agg_func, is_numeric)

+        # if we have a non-None fillna, then replace
+        if fillna is not None:
+            result[np.isnan(result)] = fillna
+
        if com.is_integer_dtype(result):
            if len(result[result == tslib.iNaT]) > 0:
                result = result.astype('float64')
@@ -2581,8 +2589,8 @@ def _iterate_slices(self):
                continue
            yield val, slicer(val)

-    def _cython_agg_general(self, how, numeric_only=True):
-        new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
+    def _cython_agg_general(self, how, numeric_only=True, fillna=None):
+        new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only, fillna=fillna)
        return self._wrap_agged_blocks(new_items, new_blocks)

    def _wrap_agged_blocks(self, items, blocks):
@@ -2608,7 +2616,7 @@ def _wrap_agged_blocks(self, items, blocks):

    _block_agg_axis = 0

-    def _cython_agg_blocks(self, how, numeric_only=True):
+    def _cython_agg_blocks(self, how, numeric_only=True, fillna=None):
        data, agg_axis = self._get_data_to_aggregate()

        new_blocks = []
@@ -2620,7 +2628,7 @@

            values = block._try_operate(block.values)

-            result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
+            result, _ = self.grouper.aggregate(values, how, axis=agg_axis, fillna=fillna)

            # see if we can cast the block back to the original dtype
            result = block._try_coerce_and_cast_result(result)
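To make the groupby change concrete, a sketch of the intended user-visible effect; the commented outputs assume this patch is applied (group `'b'` is all-NaN):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'val': [1.0, 2.0, np.nan]})
grouped = df.groupby('key')['val']

# With fillna=0.0 / fillna=1.0 threaded through the cython aggregation
# path above, the all-NaN group gets the reduction identity, not NaN:
grouped.sum()   # a -> 3.0, b -> 0.0 (NaN before this patch)
grouped.prod()  # a -> 2.0, b -> 1.0 (NaN before this patch)
```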
46 changes: 40 additions & 6 deletions pandas/core/nanops.py
@@ -9,7 +9,7 @@
_USE_BOTTLENECK = False

import pandas.hashtable as _hash
-from pandas import compat, lib, algos, tslib
+from pandas import compat, lib, algos, tslib, _np_version_under1p10
from pandas.compat import builtins
from pandas.core.common import (isnull, notnull, _values_from_object,
                                _maybe_upcast_putmask,
@@ -243,12 +243,14 @@ def nanall(values, axis=None, skipna=True):
@disallow('M8')
@bottleneck_switch(zero_value=0)
def nansum(values, axis=None, skipna=True):
+    dtype = values.dtype
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    the_sum = values.sum(axis, dtype=dtype_sum)
-    the_sum = _maybe_null_out(the_sum, axis, mask)
+    the_sum = _maybe_null_out(the_sum, axis, mask, allow_all_null=not skipna,
+                              dtype=dtype, fill_value=0)

    return _wrap_results(the_sum, dtype)

@@ -549,12 +551,14 @@ def nankurt(values, axis=None, skipna=True):

@disallow('M8','m8')
def nanprod(values, axis=None, skipna=True):
+    dtype = values.dtype
    mask = isnull(values)
    if skipna and not is_any_int_dtype(values):
        values = values.copy()
        values[mask] = 1
    result = values.prod(axis)
-    return _maybe_null_out(result, axis, mask)
+    return _maybe_null_out(result, axis, mask, allow_all_null=not skipna, dtype=dtype,
+                           fill_value=1)


def _maybe_arg_null_out(result, axis, mask, skipna):
@@ -588,19 +592,49 @@ def _get_counts(mask, axis, dtype=float):
    return np.array(count, dtype=dtype)


-def _maybe_null_out(result, axis, mask):
+def _maybe_null_out(result, axis, mask, allow_all_null=True, dtype=None, fill_value=None):
+
+
+    # 9422
+    # if we have all nulls we normally return a
+    # null, but for numpy >= 1.8.2 and bottleneck >= 1.0
+    # nansum/nanprod are set to be the fill_values
+    if not allow_all_null and dtype is not None:
+
+        if is_complex_dtype(dtype) or not is_float_dtype(dtype):
+
+            # we don't mask complex
+            # object or non-floats
+            # if numpy changes this, we will as well
+
+            # IOW, np.nansum(np.array([np.nan],dtype='object')) is np.nan
+            # https://github.com/numpy/numpy/issues/6209
+            allow_all_null = True
+            fill_value = np.nan
+
+        else:
+            fill_value = np.nan
+
    if axis is not None and getattr(result, 'ndim', False):
        null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
        if np.any(null_mask):
            if np.iscomplexobj(result):
                result = result.astype('c16')
            else:
                result = result.astype('f8')
+
+            # mark nans
            result[null_mask] = np.nan
+
+        # fill only the all-nan slices
+        if not allow_all_null:
+            null_mask = mask.all(axis)
+            if null_mask.any():
+                result[null_mask] = fill_value
    else:
        null_mask = mask.size - mask.sum()
-        if null_mask == 0:
-            result = np.nan
+        if null_mask == 0 and (mask.size > 0 or allow_all_null):
+            result = fill_value

    return result

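The comments in `_maybe_null_out` refer to numpy's own behavior; a short sketch of the semantics being matched (note `np.nanprod` only exists on numpy >= 1.10, which is presumably why `_np_version_under1p10` is imported above):

```python
import numpy as np

# numpy >= 1.8.2 returns the reduction identity for an all-NaN float array:
np.nansum(np.array([np.nan]))    # 0.0
np.nanprod(np.array([np.nan]))   # 1.0 (numpy >= 1.10 only)

# ... but object arrays are not masked, which is why the code above
# falls back to NaN for non-float dtypes (see numpy/numpy#6209):
np.nansum(np.array([np.nan], dtype='object'))  # nan
```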
44 changes: 34 additions & 10 deletions pandas/tests/test_frame.py
@@ -12230,10 +12230,10 @@ def test_count(self):
        assert_series_equal(result, expected)

    def test_sum(self):
-        self._check_stat_op('sum', np.sum, has_numeric_only=True)
+        self._check_stat_op('sum', np.sum, has_numeric_only=True, fillna=0.0)

        # mixed types (with upcasting happening)
-        self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'),
+        self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'), fillna=0.0,
                            has_numeric_only=True, check_dtype=False, check_less_precise=True)

    def test_stat_operators_attempt_obj_array(self):
@@ -12247,23 +12247,32 @@ def test_stat_operators_attempt_obj_array(self):
        df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
                        dtype='O')
        methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']
+        fills = [0.0, np.nan, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan]

        # GH #676
        df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
                         2: [np.nan, 4]}, dtype=object)

        for df in [df1, df2]:
-            for meth in methods:
+            for meth, fill in zip(methods, fills):
                self.assertEqual(df.values.dtype, np.object_)
                result = getattr(df, meth)(1)
+
+                # 9422
+                # all-NaN object array is still NaN, while floats are not :<
                expected = getattr(df.astype('f8'), meth)(1)
+                if not np.isnan(fill):
+                    mask = df.isnull().all(1)
+                    if mask.any():
+                        expected[mask] = np.nan
+
                assert_series_equal(result, expected)

    def test_mean(self):
        self._check_stat_op('mean', np.mean, check_dates=True)

    def test_product(self):
-        self._check_stat_op('product', np.prod)
+        self._check_stat_op('product', np.prod, fillna=1.0)

    def test_median(self):
        def wrapper(x):
@@ -12435,7 +12444,7 @@ def alt(x):

    def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
                       has_numeric_only=False, check_dtype=True, check_dates=False,
-                      check_less_precise=False):
+                      check_less_precise=False, fillna=None):
        if frame is None:
            frame = self.frame
        # set some NAs
@@ -12478,11 +12487,20 @@ def wrapper(x):
            wrapper = alternative

        result0 = f(axis=0)
-        result1 = f(axis=1)
-        assert_series_equal(result0, frame.apply(skipna_wrapper),
+        expected0 = frame.apply(skipna_wrapper)
+        assert_series_equal(result0, expected0,
                            check_dtype=check_dtype,
                            check_less_precise=check_less_precise)
-        assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
+
+        result1 = f(axis=1)
+
+        # 9422
+        # all-nan rows get the fillna
+        expected1 = frame.apply(skipna_wrapper, axis=1)
+        if fillna is not None:
+            expected1[isnull(frame).all(axis=1)] = fillna
+
+        assert_series_equal(result1, expected1,
                            check_dtype=False,
                            check_less_precise=check_less_precise)

@@ -12513,8 +12531,14 @@ def wrapper(x):
        all_na = self.frame * np.NaN
        r0 = getattr(all_na, name)(axis=0)
        r1 = getattr(all_na, name)(axis=1)
-        self.assertTrue(np.isnan(r0).all())
-        self.assertTrue(np.isnan(r1).all())
+
+        # 9422
+        if fillna is not None:
+            self.assertTrue((r0==fillna).all())
+            self.assertTrue((r1==fillna).all())
+        else:
+            self.assertTrue(np.isnan(r0).all())
+            self.assertTrue(np.isnan(r1).all())

    def test_mode(self):
        df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
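A standalone distillation of the all-NaN assertions added to `_check_stat_op` (a hypothetical snippet mirroring the test, not part of the suite; it passes only with this patch applied):

```python
import numpy as np
import pandas as pd

# every row and column of this frame reduces over no non-missing
# values, so sum/prod should yield their fill values
all_na = pd.DataFrame(np.nan, index=range(3), columns=list('AB'))

assert (all_na.sum(axis=0) == 0.0).all()
assert (all_na.sum(axis=1) == 0.0).all()
assert (all_na.prod(axis=0) == 1.0).all()
assert (all_na.prod(axis=1) == 1.0).all()
```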