-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
CLN/API refactor drop and filter and deprecate select #6599
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1377,33 +1377,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): | |
|
||
_xs = xs | ||
|
||
# TODO: Check if this was clearer in 0.12 | ||
def select(self, crit, axis=0): | ||
""" | ||
Return data corresponding to axis labels matching criteria | ||
|
||
Parameters | ||
---------- | ||
crit : function | ||
To be called on each index (label). Should return True or False | ||
axis : int | ||
|
||
Returns | ||
------- | ||
selection : type of caller | ||
""" | ||
axis = self._get_axis_number(axis) | ||
axis_name = self._get_axis_name(axis) | ||
axis_values = self._get_axis(axis) | ||
|
||
if len(axis_values) > 0: | ||
new_axis = axis_values[ | ||
np.asarray([bool(crit(label)) for label in axis_values])] | ||
else: | ||
new_axis = axis_values | ||
|
||
return self.reindex(**{axis_name: new_axis}) | ||
|
||
def reindex_like(self, other, method=None, copy=True, limit=None): | ||
""" return an object with matching indicies to myself | ||
|
||
|
@@ -1427,55 +1400,32 @@ def reindex_like(self, other, method=None, copy=True, limit=None): | |
d = other._construct_axes_dict(method=method, copy=copy, limit=limit) | ||
return self.reindex(**d) | ||
|
||
def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): | ||
def drop(self, labels, axis=0, level=None, inplace=False, | ||
regex=True, **kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. default of |
||
""" | ||
Return new object with labels in requested axis removed | ||
|
||
Parameters | ||
---------- | ||
labels : single label or list-like | ||
labels : Either function, regex or list-like | ||
Boolean function to be called on each index (label) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you put bullet points |
||
Regular expression to be tested against each index | ||
List of info axis to restrict to | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe 'list of info axis labels to restrict to'? (that makes it a bit clearer I think) + a single string should also be allowed! (not only a list-like) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and actually it should be 'list of info axis labels to drop' instead of to 'restrict to'? |
||
axis : int or axis name | ||
level : int or level name, default None | ||
For MultiIndex | ||
inplace : bool, default False | ||
If True, do operation inplace and return None. | ||
regex : string or False | ||
If a string, which string methods to use for selection, | ||
Can be 'match', 'contains', 'search' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the default? Why only string of False, and not True? |
||
|
||
Returns | ||
------- | ||
dropped : type of caller | ||
""" | ||
axis = self._get_axis_number(axis) | ||
axis_name = self._get_axis_name(axis) | ||
axis, axis_ = self._get_axis(axis), axis | ||
|
||
if axis.is_unique: | ||
if level is not None: | ||
if not isinstance(axis, MultiIndex): | ||
raise AssertionError('axis must be a MultiIndex') | ||
new_axis = axis.drop(labels, level=level) | ||
else: | ||
new_axis = axis.drop(labels) | ||
dropped = self.reindex(**{axis_name: new_axis}) | ||
try: | ||
dropped.axes[axis_].set_names(axis.names, inplace=True) | ||
except AttributeError: | ||
pass | ||
result = dropped | ||
|
||
else: | ||
labels = com._index_labels_to_array(labels) | ||
if level is not None: | ||
if not isinstance(axis, MultiIndex): | ||
raise AssertionError('axis must be a MultiIndex') | ||
indexer = ~lib.ismember(axis.get_level_values(level), | ||
set(labels)) | ||
else: | ||
indexer = ~axis.isin(labels) | ||
|
||
slicer = [slice(None)] * self.ndim | ||
slicer[self._get_axis_number(axis_name)] = indexer | ||
|
||
result = self.ix[tuple(slicer)] | ||
""" | ||
result = self._select(labels, axis, level=level, regex=regex, negate=True) | ||
|
||
if inplace: | ||
self._update_inplace(result) | ||
|
@@ -1737,44 +1687,109 @@ def _reindex_axis(self, new_index, fill_method, axis, copy): | |
else: | ||
return self._constructor(new_data).__finalize__(self) | ||
|
||
def filter(self, items=None, like=None, regex=None, axis=None): | ||
def filter(self, labels=None, axis=None, level=None, inplace=False, | ||
regex=True, **kwargs): | ||
""" | ||
Restrict the info axis to set of items or wildcard | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is 'wildcard'? |
||
|
||
Parameters | ||
---------- | ||
items : list-like | ||
List of info axis to restrict to (must not all be present) | ||
like : string | ||
Keep info axis where "arg in col == True" | ||
regex : string (regular expression) | ||
Keep info axis with re.search(regex, col) == True | ||
labels : Either function, regex or list-like | ||
Boolean function to be called on each index (label) | ||
Regular expression to be tested against each index | ||
List of info axis to restrict to | ||
|
||
Notes | ||
----- | ||
Arguments are mutually exclusive, but this is not checked for | ||
axis : int | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add in an explanation sentence something like 'Filter on index rows (0) or columns (1)'? (I think this is something people always forget, so handy to see it reminded in the docstring) |
||
level : int or level name, default None | ||
For MultiIndex | ||
inplace : bool, default False | ||
If True, do operation inplace and return None. | ||
regex : string or False | ||
If a string, which string methods to use for selection, | ||
Can be 'match', 'contains', 'search' | ||
|
||
TODO actually we can do contains more efficiently without regex | ||
using list comprehension, so really these mutually exclusive | ||
whether these regex / "kind"... ?? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should still be cleaned up :-) |
||
|
||
""" | ||
while kwargs: | ||
items = kwargs.pop('items', None) | ||
if items is not None: | ||
return self.filter(items, axis=axis, level=level, inplace=inplace, regex=False) | ||
like = kwargs.pop('like', None) | ||
if like is not None: | ||
return self.filter(like, regex='match', axis=axis, level=level, inplace=inplace) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should prepend There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, I don't think Eg:
Also, if this wasn't catched by the tests, then there should really added some ... |
||
# if you're here you've passed an unknown arg | ||
raise TypeError("unknown kwargs passed: %s" % ', '.join(kwargs)) | ||
|
||
if labels is None: | ||
if isinstance(regex, string_types) and regex != 'match': | ||
# slight break in old behaviour if regex == 'match' | ||
return self.filter(regex, regex='contains', axis=axis, | ||
level=level, inplace=inplace) | ||
raise TypeError("labels argument must not be None") | ||
|
||
result = self._select(labels, axis, level=level, regex=regex) | ||
|
||
""" | ||
import re | ||
if inplace: | ||
self._update_inplace(result) | ||
else: | ||
return result | ||
|
||
def _select(self, labels, axis, level=None, regex=True, negate=False): | ||
if axis is None: | ||
axis = self._info_axis_name | ||
axis_name = self._get_axis_name(axis) | ||
axis_values = self._get_axis(axis_name) | ||
|
||
if items is not None: | ||
return self.reindex(**{axis_name: [r for r in items | ||
if r in axis_values]}) | ||
elif like: | ||
matchf = lambda x: (like in x if isinstance(x, string_types) | ||
else like in str(x)) | ||
return self.select(matchf, axis=axis_name) | ||
elif regex: | ||
matcher = re.compile(regex) | ||
return self.select(lambda x: matcher.search(x) is not None, | ||
axis=axis_name) | ||
else: | ||
raise TypeError('Must pass either `items`, `like`, or `regex`') | ||
axis_number = self._get_axis_number(axis) | ||
axis_values = self._get_axis(axis_number) | ||
|
||
if level is not None: | ||
axis_values = axis_values.get_level_values(level) | ||
|
||
if hasattr(labels, '__call__'): | ||
msk = axis_values.map(labels).astype(bool) | ||
|
||
elif isinstance(labels, string_types): | ||
if level is None: | ||
axis_values = axis_values.get_level_values(0) | ||
|
||
if not regex: | ||
msk = axis_values == labels | ||
else: | ||
from pandas.core.strings import str_contains | ||
msk = str_contains(axis_values, labels, na=False) | ||
|
||
elif not hasattr(labels, '__iter__'): | ||
msk = axis_values == labels | ||
|
||
else: # is list-like | ||
if isinstance(axis_values, MultiIndex): | ||
if isinstance(labels, tuple): | ||
# hack for dropping single col with tuple | ||
labels = [labels] | ||
elif level is None: | ||
# use level=0 if None passed, warn? | ||
level = 0 | ||
msk = axis_values.isin(labels) | ||
|
||
if negate: # aka drop | ||
msk = ~msk | ||
|
||
tuple_indexer = [slice(None)] * self.ndim | ||
tuple_indexer[axis_number] = msk | ||
try: | ||
return self.iloc[tuple(tuple_indexer)] | ||
except IndexError: # can happen with sparse when no _data | ||
axis_name = self._get_axis_name(axis_number) | ||
return self.reindex(**{axis_name: axis_values[msk]}) | ||
|
||
def select(self, crit, axis=0): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe need a deprecation warning? |
||
""" | ||
depreciated, alias for filter | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe just add the original docstring back? When people get the deprecation warning, it can still be handy to see what the old function exactly did (and so your old code exactly did to see what you have to change) |
||
""" | ||
return self.filter(crit, axis=axis) | ||
|
||
def head(self, n=5): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3286,7 +3286,7 @@ def check(result, expected=None): | |
result = df.drop(['a'],axis=1) | ||
expected = DataFrame([[1],[1],[1]],columns=['bar']) | ||
check(result,expected) | ||
result = df.drop('a',axis=1) | ||
result = df.drop('a', axis=1, regex=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it should really not be necessary to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 yep, hadn't realised the state of this. went through some back and forth. |
||
check(result,expected) | ||
|
||
# describe | ||
|
@@ -9717,8 +9717,8 @@ def test_filter(self): | |
self.assertEqual(len(filtered.columns), 2) | ||
|
||
# pass in None | ||
with assertRaisesRegexp(TypeError, 'Must pass'): | ||
self.frame.filter(items=None) | ||
with assertRaisesRegexp(TypeError, 'must not be None'): | ||
self.frame.filter(labels=None) | ||
|
||
# objects | ||
filtered = self.mixed_frame.filter(like='foo') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1587,11 +1587,11 @@ def test_mixed_depth_drop(self): | |
index = MultiIndex.from_tuples(tuples) | ||
df = DataFrame(randn(4, 6), columns=index) | ||
|
||
result = df.drop('a', axis=1) | ||
result = df.drop('a', axis=1, level=0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is some hoop jumping here before (so if you don't pass level it uses the top one), I will turn that back on. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is it needed to add |
||
expected = df.drop([('a', '', '')], axis=1) | ||
assert_frame_equal(expected, result) | ||
|
||
result = df.drop(['top'], axis=1) | ||
result = df.drop(['top'], axis=1, level=0) | ||
expected = df.drop([('top', 'OD', 'wx')], axis=1) | ||
expected = expected.drop([('top', 'OD', 'wy')], axis=1) | ||
assert_frame_equal(expected, result) | ||
|
@@ -1601,7 +1601,7 @@ def test_mixed_depth_drop(self): | |
assert_frame_equal(expected, result) | ||
|
||
expected = df.drop([('top', 'OD', 'wy')], axis=1) | ||
expected = df.drop('top', axis=1) | ||
expected = df.drop('top', axis=1, level=0) | ||
|
||
result = df.drop('result1', level=1, axis=1) | ||
expected = df.drop([('routine1', 'result1', ''), | ||
|
@@ -1647,7 +1647,7 @@ def test_mixed_depth_pop(self): | |
self.assertEquals(result.name, 'a') | ||
|
||
expected = df1['top'] | ||
df1 = df1.drop(['top'], axis=1) | ||
df1 = df1.drop(['top'], axis=1, level=0) | ||
result = df2.pop('top') | ||
assert_frame_equal(expected, result) | ||
assert_frame_equal(df1, df2) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this should be 'drop and filter' + deprecation of filter should be added