From 477ec1ae0b8475271a1ec2eecefc51c80699fcff Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Tue, 11 Mar 2014 01:47:35 -0700 Subject: [PATCH] CLN/API refactor drop and filter and deprecate select API filter and drop take same arguments --- doc/source/v0.14.0.txt | 1 + pandas/core/generic.py | 191 +++++++++++++++++--------------- pandas/core/panelnd.py | 2 +- pandas/tests/test_frame.py | 6 +- pandas/tests/test_multilevel.py | 8 +- pandas/tests/test_series.py | 4 +- 6 files changed, 114 insertions(+), 98 deletions(-) diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 8443fad889601..ee11f1bd3e391 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -484,6 +484,7 @@ Enhancements - ``DataFrame.to_latex`` now takes a longtable keyword, which if True will return a table in a longtable environment. (:issue:`6617`) - ``pd.read_clipboard`` will, if 'sep' is unspecified, try to detect data copied from a spreadsheet and parse accordingly. (:issue:`6223`) +- Unify drop and select API, allow drop to take a regex (:issue:`4818`) and drop with a boolean mask (:issue:`6189`) - Joining a singly-indexed DataFrame with a multi-indexed DataFrame (:issue:`3662`) See :ref:`the docs`. Joining multi-index DataFrames on both the left and right is not yet supported ATM. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 01af7534d458d..7f38dd6f0db88 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1377,33 +1377,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): _xs = xs - # TODO: Check if this was clearer in 0.12 - def select(self, crit, axis=0): - """ - Return data corresponding to axis labels matching criteria - - Parameters - ---------- - crit : function - To be called on each index (label). 
Should return True or False - axis : int - - Returns - ------- - selection : type of caller - """ - axis = self._get_axis_number(axis) - axis_name = self._get_axis_name(axis) - axis_values = self._get_axis(axis) - - if len(axis_values) > 0: - new_axis = axis_values[ - np.asarray([bool(crit(label)) for label in axis_values])] - else: - new_axis = axis_values - - return self.reindex(**{axis_name: new_axis}) - def reindex_like(self, other, method=None, copy=True, limit=None): """ return an object with matching indicies to myself @@ -1427,55 +1400,32 @@ def reindex_like(self, other, method=None, copy=True, limit=None): d = other._construct_axes_dict(method=method, copy=copy, limit=limit) return self.reindex(**d) - def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): + def drop(self, labels, axis=0, level=None, inplace=False, + regex=True, **kwargs): """ Return new object with labels in requested axis removed Parameters ---------- - labels : single label or list-like + labels : Either function, regex or list-like + Boolean function to be called on each index (label) + Regular expression to be tested against each index + List of info axis to restrict to axis : int or axis name level : int or level name, default None For MultiIndex inplace : bool, default False If True, do operation inplace and return None. 
+ regex : string or False + If a string, which string methods to use for selection, + Can be 'match', 'contains', 'search' Returns ------- dropped : type of caller - """ - axis = self._get_axis_number(axis) - axis_name = self._get_axis_name(axis) - axis, axis_ = self._get_axis(axis), axis - - if axis.is_unique: - if level is not None: - if not isinstance(axis, MultiIndex): - raise AssertionError('axis must be a MultiIndex') - new_axis = axis.drop(labels, level=level) - else: - new_axis = axis.drop(labels) - dropped = self.reindex(**{axis_name: new_axis}) - try: - dropped.axes[axis_].set_names(axis.names, inplace=True) - except AttributeError: - pass - result = dropped - - else: - labels = com._index_labels_to_array(labels) - if level is not None: - if not isinstance(axis, MultiIndex): - raise AssertionError('axis must be a MultiIndex') - indexer = ~lib.ismember(axis.get_level_values(level), - set(labels)) - else: - indexer = ~axis.isin(labels) - slicer = [slice(None)] * self.ndim - slicer[self._get_axis_number(axis_name)] = indexer - - result = self.ix[tuple(slicer)] + """ + result = self._select(labels, axis, level=level, regex=regex, negate=True) if inplace: self._update_inplace(result) @@ -1737,44 +1687,109 @@ def _reindex_axis(self, new_index, fill_method, axis, copy): else: return self._constructor(new_data).__finalize__(self) - def filter(self, items=None, like=None, regex=None, axis=None): + def filter(self, labels=None, axis=None, level=None, inplace=False, + regex=True, **kwargs): """ Restrict the info axis to set of items or wildcard Parameters ---------- - items : list-like - List of info axis to restrict to (must not all be present) - like : string - Keep info axis where "arg in col == True" - regex : string (regular expression) - Keep info axis with re.search(regex, col) == True + labels : Either function, regex or list-like + Boolean function to be called on each index (label) + Regular expression to be tested against each index + List of info axis to 
restrict to - Notes - ----- - Arguments are mutually exclusive, but this is not checked for + axis : int + level : int or level name, default None + For MultiIndex + inplace : bool, default False + If True, do operation inplace and return None. + regex : string or False + If a string, which string methods to use for selection, + Can be 'match', 'contains', 'search' + + TODO actually we can do contains more efficiently without regex + using list comprehension, so really these mutually exclusive + whether these regex / "kind"... ?? + + """ + while kwargs: + items = kwargs.pop('items', None) + if items is not None: + return self.filter(items, axis=axis, level=level, inplace=inplace, regex=False) + like = kwargs.pop('like', None) + if like is not None: + return self.filter(like, regex='match', axis=axis, level=level, inplace=inplace) + # if you're here you've passed an unknown arg + raise TypeError("unknown kwargs passed: %s" % ', '.join(kwargs)) + + if labels is None: + if isinstance(regex, string_types) and regex != 'match': + # slight break in old behaviour if regex == 'match' + return self.filter(regex, regex='contains', axis=axis, + level=level, inplace=inplace) + raise TypeError("labels argument must not be None") + + result = self._select(labels, axis, level=level, regex=regex) - """ - import re + if inplace: + self._update_inplace(result) + else: + return result + def _select(self, labels, axis, level=None, regex=True, negate=False): if axis is None: axis = self._info_axis_name - axis_name = self._get_axis_name(axis) - axis_values = self._get_axis(axis_name) - if items is not None: - return self.reindex(**{axis_name: [r for r in items - if r in axis_values]}) - elif like: - matchf = lambda x: (like in x if isinstance(x, string_types) - else like in str(x)) - return self.select(matchf, axis=axis_name) - elif regex: - matcher = re.compile(regex) - return self.select(lambda x: matcher.search(x) is not None, - axis=axis_name) - else: - raise TypeError('Must pass 
either `items`, `like`, or `regex`') + axis_number = self._get_axis_number(axis) + axis_values = self._get_axis(axis_number) + + if level is not None: + axis_values = axis_values.get_level_values(level) + + if hasattr(labels, '__call__'): + msk = axis_values.map(labels).astype(bool) + + elif isinstance(labels, string_types): + if level is None: + axis_values = axis_values.get_level_values(0) + + if not regex: + msk = axis_values == labels + else: + from pandas.core.strings import str_contains + msk = str_contains(axis_values, labels, na=False) + + elif not hasattr(labels, '__iter__'): + msk = axis_values == labels + + else: # is list-like + if isinstance(axis_values, MultiIndex): + if isinstance(labels, tuple): + # hack for dropping single col with tuple + labels = [labels] + elif level is None: + # use level=0 if None passed, warn? + level = 0 + msk = axis_values.isin(labels) + + if negate: # aka drop + msk = ~msk + + tuple_indexer = [slice(None)] * self.ndim + tuple_indexer[axis_number] = msk + try: + return self.iloc[tuple(tuple_indexer)] + except IndexError: # can happen with sparse when no _data + axis_name = self._get_axis_name(axis_number) + return self.reindex(**{axis_name: axis_values[msk]}) + + def select(self, crit, axis=0): + """ + deprecated, alias for filter + + """ + return self.filter(crit, axis=axis) def head(self, n=5): """ diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index 3eebd51190e3d..8a813d3e8bb97 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -96,7 +96,7 @@ def _combine_with_constructor(self, other, func): klass._combine_with_constructor = _combine_with_constructor # set as NonImplemented operations which we don't support - for f in ['to_frame', 'to_excel', 'to_sparse', 'groupby', 'join', 'filter', + for f in ['to_frame', 'to_excel', 'to_sparse', 'groupby', 'join', 'dropna', 'shift']: def func(self, *args, **kwargs): raise NotImplementedError diff --git a/pandas/tests/test_frame.py 
b/pandas/tests/test_frame.py index fc68449f75e0f..627fb5d55eed2 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3286,7 +3286,7 @@ def check(result, expected=None): result = df.drop(['a'],axis=1) expected = DataFrame([[1],[1],[1]],columns=['bar']) check(result,expected) - result = df.drop('a',axis=1) + result = df.drop('a', axis=1, regex=False) check(result,expected) # describe @@ -9717,8 +9717,8 @@ def test_filter(self): self.assertEqual(len(filtered.columns), 2) # pass in None - with assertRaisesRegexp(TypeError, 'Must pass'): - self.frame.filter(items=None) + with assertRaisesRegexp(TypeError, 'must not be None'): + self.frame.filter(labels=None) # objects filtered = self.mixed_frame.filter(like='foo') diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a6c2bb9f56602..dbf714b094e5b 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1587,11 +1587,11 @@ def test_mixed_depth_drop(self): index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) - result = df.drop('a', axis=1) + result = df.drop('a', axis=1, level=0) expected = df.drop([('a', '', '')], axis=1) assert_frame_equal(expected, result) - result = df.drop(['top'], axis=1) + result = df.drop(['top'], axis=1, level=0) expected = df.drop([('top', 'OD', 'wx')], axis=1) expected = expected.drop([('top', 'OD', 'wy')], axis=1) assert_frame_equal(expected, result) @@ -1601,7 +1601,7 @@ def test_mixed_depth_drop(self): assert_frame_equal(expected, result) expected = df.drop([('top', 'OD', 'wy')], axis=1) - expected = df.drop('top', axis=1) + expected = df.drop('top', axis=1, level=0) result = df.drop('result1', level=1, axis=1) expected = df.drop([('routine1', 'result1', ''), @@ -1647,7 +1647,7 @@ def test_mixed_depth_pop(self): self.assertEquals(result.name, 'a') expected = df1['top'] - df1 = df1.drop(['top'], axis=1) + df1 = df1.drop(['top'], axis=1, level=0) result = df2.pop('top') 
assert_frame_equal(expected, result) assert_frame_equal(df1, df2) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 839804be4437c..8286783bbc4dc 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1634,8 +1634,8 @@ def test_drop(self): # single string/tuple-like s = Series(range(3),index=list('abc')) - self.assertRaises(ValueError, s.drop, 'bc') - self.assertRaises(ValueError, s.drop, ('a',)) + assert_series_equal(s.drop('bc'), s) + assert_series_equal(s.drop(('a',)), s.loc[['b', 'c']]) # bad axis self.assertRaises(ValueError, s.drop, 'one', axis='columns')