Skip to content

ENH: Move any/all to NDFrame, support additional arguments for Series. GH8302 #8550

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 11, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,20 @@ API changes

- Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`)

- ``Series.all`` and ``Series.any`` now support the ``level`` and ``skipna`` parameters. ``Series.all``, ``Series.any``, ``Index.all``, and ``Index.any`` no longer support the ``out`` and ``keepdims`` parameters, which existed for compatibility with ndarray. Various index types no longer support the ``all`` and ``any`` aggregation functions. (:issue:`8302`):

.. ipython:: python

s = pd.Series([False, True, False], index=[0, 0, 1])
s.any(level=0)

- ``Panel`` now supports the ``all`` and ``any`` aggregation functions. (:issue:`8302`):

.. ipython:: python

p = pd.Panel(np.random.rand(2, 5, 4) > 0.1)
p.all()

.. _whatsnew_0152.enhancements:

Enhancements
Expand All @@ -44,4 +58,4 @@ Experimental
Bug Fixes
~~~~~~~~~
- Bug in ``groupby`` signatures that didn't include ``*args`` or ``**kwargs`` (:issue:`8733`).
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo (:issue:`8761`).
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo (:issue:`8761`).
18 changes: 0 additions & 18 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,18 +268,6 @@ def __unicode__(self):
quote_strings=True)
return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)

def _unbox(func):
    # Adapt an ndarray reduction (e.g. np.ndarray.all) into an instance
    # method: forward the call to ``self.values`` and, when the result is
    # 0-dimensional, unbox it to a plain NumPy scalar.
    @Appender(func.__doc__)
    def unboxed(self, *args, **kwargs):
        from pandas.core.index import Index
        result = func(self.values, *args, **kwargs)
        is_zero_dim = (isinstance(result, (np.ndarray, com.ABCSeries, Index))
                       and result.ndim == 0)
        if not is_zero_dim:  # pragma: no cover
            return result
        # return NumPy type
        return result.dtype.type(result.item())
    unboxed.__name__ = func.__name__
    return unboxed

class IndexOpsMixin(object):
""" common ops mixin to support a unified inteface / docs for Series / Index """
Expand Down Expand Up @@ -528,12 +516,6 @@ def duplicated(self, take_last=False):
from pandas.core.index import Index
return Index(duplicated)

#----------------------------------------------------------------------
# unbox reductions

all = _unbox(np.ndarray.all)
any = _unbox(np.ndarray.any)

#----------------------------------------------------------------------
# abstracts

Expand Down
62 changes: 0 additions & 62 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4133,68 +4133,6 @@ def _count_level(self, level, axis=0, numeric_only=False):
else:
return result

def any(self, axis=None, bool_only=None, skipna=True, level=None,
        **kwargs):
    """
    Return whether any element is True over requested axis.
    %(na_action)s

    Parameters
    ----------
    axis : {0, 1}
        0 for row-wise, 1 for column-wise
    skipna : boolean, default True
        Exclude NA/null values. If an entire row/column is NA, the result
        will be NA
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level, collapsing into a DataFrame
    bool_only : boolean, default None
        Only include boolean data.

    Returns
    -------
    any : Series (or DataFrame if level specified)
    """
    # default to the statistics axis when the caller gives none
    target_axis = self._stat_axis_number if axis is None else axis
    if level is None:
        # plain reduction over the requested axis
        return self._reduce(nanops.nanany, 'any', axis=target_axis,
                            skipna=skipna, numeric_only=bool_only,
                            filter_type='bool')
    # MultiIndex: aggregate within each group of the requested level
    return self._agg_by_level('any', axis=target_axis, level=level,
                              skipna=skipna)

def all(self, axis=None, bool_only=None, skipna=True, level=None,
        **kwargs):
    """
    Return whether all elements are True over requested axis.
    %(na_action)s

    Parameters
    ----------
    axis : {0, 1}
        0 for row-wise, 1 for column-wise
    skipna : boolean, default True
        Exclude NA/null values. If an entire row/column is NA, the result
        will be NA
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level, collapsing into a DataFrame
    bool_only : boolean, default None
        Only include boolean data.

    Returns
    -------
    all : Series (or DataFrame if level specified)
    """
    # default to the statistics axis when the caller gives none
    target_axis = self._stat_axis_number if axis is None else axis
    if level is None:
        # plain reduction over the requested axis
        return self._reduce(nanops.nanall, 'all', axis=target_axis,
                            skipna=skipna, numeric_only=bool_only,
                            filter_type='bool')
    # MultiIndex: aggregate within each group of the requested level
    return self._agg_by_level('all', axis=target_axis, level=level,
                              skipna=skipna)

def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
filter_type=None, **kwds):
axis = self._get_axis_number(axis)
Expand Down
52 changes: 52 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3888,6 +3888,7 @@ def _add_numeric_operations(cls):
])
name = (cls._constructor_sliced.__name__
if cls._AXIS_LEN > 1 else 'scalar')

_num_doc = """

%(desc)s
Expand All @@ -3905,6 +3906,27 @@ def _add_numeric_operations(cls):
Include only float, int, boolean data. If None, will attempt to use
everything, then use only numeric data

Returns
-------
%(outname)s : """ + name + " or " + cls.__name__ + " (if level specified)\n"

_bool_doc = """

%(desc)s

Parameters
----------
axis : """ + axis_descr + """
skipna : boolean, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA
level : int or level name, default None
If the axis is a MultiIndex (hierarchical), count along a
particular level, collapsing into a """ + name + """
bool_only : boolean, default None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These logical functions have slightly different documentation: a 'bool_only' field instead of a 'numeric_only' field, and potentially a message about supporting additional numpy arguments via the bool_extended_args variable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is necessary (nor is any other 'bool_extended_args' variable); it's just not needed. That said, if you have a use case, please show it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to better understand your comment. I mentioned two special cases in the documentation here, the bool_only argument name and the documentation message about supporting additional numpy arguments, which are currently supported in master's Series.any/all, which forwards to numpy's any/all.

  1. bool_only is the preexisting argument name for DataFrame's any and all, in master. Are you suggesting I should change it to numeric_only?
  2. You had suggested earlier that I should continue to support the ndarray.any/all arguments that are supported by Series.any/all in master (these are the 'out' and 'keepdims' parameters). Are you saying that I should not support these, or that I should not document that they are supported?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. maybe I wasn't clear. The signature should be something like: any(axis, skipna, level, **kwargs); we accept but don't deal explicitly with the numpy args (e.g. out) and such. It's not useful, nor is it consistent with how pandas works.

  2. I realized I was confusing about bool_only, yes that is a good idea (just not bool_extended_args

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, got it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't bool_only=True be the default?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made bool_only default to None (False) here because:

  • The default is None in DataFrame’s current implementation of any/all in master
  • The corresponding numeric_only argument to sum, mean, etc defaults to None
  • bool_only is not currently implemented in Series._reduce or Panel._reduce. Once we add not implemented errors, they would be thrown in the case of all default arguments, were bool_only to be True by default.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, though I find its a little odd (e.g. I can't think of a reason to use any/all on non-boolean data), but I am sure people will try it!.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess, also, if they're calling any/all on Series in master, they are using the numpy version which does not offer a bool_only option.

Would it make sense for me to create a new issue for implementing bool_only (numeric_only) in Series an Panel, and look into whether it should be enabled by default for any/all?

Include only boolean data. If None, will attempt to use everything,
then use only boolean data

Returns
-------
%(outname)s : """ + name + " or " + cls.__name__ + " (if level specified)\n"
Expand Down Expand Up @@ -3971,6 +3993,36 @@ def stat_func(self, axis=None, skipna=None, level=None,
want the *index* of the minimum, use ``idxmin``. This is the
equivalent of the ``numpy.ndarray`` method ``argmin``.""", nanops.nanmin)

def _make_logical_function(name, desc, f):
    # Build an NDFrame boolean-reduction method (``any``/``all``) named
    # ``name`` that funnels through the nanops reduction ``f``.
    @Substitution(outname=name, desc=desc)
    @Appender(_bool_doc)
    def logical_func(self, axis=None, bool_only=None, skipna=None,
                     level=None, **kwargs):
        effective_skipna = True if skipna is None else skipna
        target_axis = self._stat_axis_number if axis is None else axis
        if level is None:
            # plain reduction over the requested axis
            return self._reduce(f, axis=target_axis,
                                skipna=effective_skipna,
                                numeric_only=bool_only,
                                filter_type='bool', name=name)
        if bool_only is not None:
            # the two options cannot currently be combined
            raise NotImplementedError(
                "Option bool_only is not implemented with option "
                "level.")
        return self._agg_by_level(name, axis=target_axis, level=level,
                                  skipna=effective_skipna)
    logical_func.__name__ = name
    return logical_func

cls.any = _make_logical_function(
    'any', 'Return whether any element is True over requested axis',
    nanops.nanany)
cls.all = _make_logical_function(
    'all', 'Return whether all elements are True over requested axis',
    nanops.nanall)

@Substitution(outname='mad',
desc="Return the mean absolute deviation of the values "
"for the requested axis")
Expand Down
79 changes: 73 additions & 6 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
import pandas.index as _index
from pandas.lib import Timestamp, Timedelta, is_datetime_array
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs
from pandas.util.decorators import Appender, cache_readonly, deprecate
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate)
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
from pandas.core.common import (_values_from_object, is_float, is_integer,
Expand Down Expand Up @@ -2088,12 +2089,13 @@ def _evaluate_with_datetime_like(self, other, op, opstr):
def _add_numeric_methods_disabled(cls):
""" add in numeric methods to disable """

def _make_invalid_op(opstr):
def _make_invalid_op(name):

def _invalid_op(self, other=None):
raise TypeError("cannot perform {opstr} with this index type: {typ}".format(opstr=opstr,
typ=type(self)))
return _invalid_op
def invalid_op(self, other=None):
raise TypeError("cannot perform {name} with this index type: {typ}".format(name=name,
typ=type(self)))
invalid_op.__name__ = name
return invalid_op

cls.__mul__ = cls.__rmul__ = _make_invalid_op('__mul__')
cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('__floordiv__')
Expand Down Expand Up @@ -2178,8 +2180,62 @@ def _evaluate_numeric_unary(self):
cls.__abs__ = _make_evaluate_unary(lambda x: np.abs(x),'__abs__')
cls.__inv__ = _make_evaluate_unary(lambda x: -x,'__inv__')

@classmethod
def _add_logical_methods(cls):
""" add in logical methods """

_doc = """

%(desc)s

Parameters
----------
All arguments to numpy.%(outname)s are accepted.

Returns
-------
%(outname)s : bool or array_like (if axis is specified)
A single element array_like may be converted to bool."""

def _make_logical_function(name, desc, f):
    # Build an Index method named ``name`` that applies the ndarray-level
    # reduction ``f`` (np.all / np.any) to the underlying values.
    @Substitution(outname=name, desc=desc)
    @Appender(_doc)
    def logical_func(self, *args, **kwargs):
        # NOTE(review): *args/**kwargs are accepted for numpy-signature
        # compatibility but silently ignored here -- confirm intended.
        result = f(self.values)
        if isinstance(result, (np.ndarray, com.ABCSeries, Index)) \
                and result.ndim == 0:
            # return NumPy type
            return result.dtype.type(result.item())
        else:  # pragma: no cover
            return result
    logical_func.__name__ = name
    return logical_func

cls.all = _make_logical_function(
'all', 'Return whether all elements are True', np.all)
cls.any = _make_logical_function(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

much better!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

'any', 'Return whether any element is True', np.any)

@classmethod
def _add_logical_methods_disabled(cls):
    """ add in logical methods to disable """

    def _make_invalid_op(name):
        # factory: produce a stand-in method that always refuses
        def invalid_op(self, other=None):
            raise TypeError("cannot perform {name} with this index type: "
                            "{typ}".format(name=name, typ=type(self)))
        invalid_op.__name__ = name
        return invalid_op

    for op_name in ('all', 'any'):
        setattr(cls, op_name, _make_invalid_op(op_name))


Index._add_numeric_methods_disabled()
Index._add_logical_methods()


class NumericIndex(Index):
"""
Expand Down Expand Up @@ -2291,7 +2347,11 @@ def equals(self, other):
def _wrap_joined_index(self, joined, other):
name = self.name if self.name == other.name else None
return Int64Index(joined, name=name)


Int64Index._add_numeric_methods()
Int64Index._add_logical_methods()


class Float64Index(NumericIndex):

Expand Down Expand Up @@ -2483,7 +2543,10 @@ def isin(self, values, level=None):
self._validate_index_level(level)
return lib.ismember_nans(self._array_values(), value_set,
isnull(list(value_set)).any())


Float64Index._add_numeric_methods()
Float64Index._add_logical_methods_disabled()


class MultiIndex(Index):
Expand Down Expand Up @@ -4436,7 +4499,11 @@ def isin(self, values, level=None):
return np.zeros(len(labs), dtype=np.bool_)
else:
return np.lib.arraysetops.in1d(labs, sought_labels)


MultiIndex._add_numeric_methods_disabled()
MultiIndex._add_logical_methods_disabled()


# For utility purposes

Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ def test_numeric_compat(self):
"cannot perform __floordiv__",
lambda : 1 // idx)

def test_logical_compat(self):
    # disabled index types must refuse both boolean reductions
    idx = self.create_index()
    for op_name in ('all', 'any'):
        tm.assertRaisesRegexp(TypeError,
                              'cannot perform %s' % op_name,
                              getattr(idx, op_name))

def test_boolean_context_compat(self):

# boolean context compat
Expand Down Expand Up @@ -820,6 +829,11 @@ def test_take(self):
expected = self.dateIndex[indexer]
self.assertTrue(result.equals(expected))

def test_logical_compat(self):
    # Index.all/any should agree with the underlying ndarray reductions
    idx = self.create_index()
    values = idx.values
    self.assertEqual(idx.all(), values.all())
    self.assertEqual(idx.any(), values.any())

def _check_method_works(self, method):
method(self.empty)
method(self.dateIndex)
Expand Down Expand Up @@ -1467,6 +1481,11 @@ def test_equals(self):
self.assertTrue(self.index.equals(same_values))
self.assertTrue(same_values.equals(self.index))

def test_logical_compat(self):
    # boolean reductions on the index match those on its raw values
    idx = self.create_index()
    for op_name in ('all', 'any'):
        self.assertEqual(getattr(idx, op_name)(),
                         getattr(idx.values, op_name)())

def test_identical(self):
i = Index(self.index.copy())
self.assertTrue(i.identical(self.index))
Expand Down
Loading