diff --git a/doc/redirects.csv b/doc/redirects.csv index 2d72dbca8816f..d7eb86d949665 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -358,7 +358,6 @@ generated/pandas.DataFrame.ewm,../reference/api/pandas.DataFrame.ewm generated/pandas.DataFrame.expanding,../reference/api/pandas.DataFrame.expanding generated/pandas.DataFrame.ffill,../reference/api/pandas.DataFrame.ffill generated/pandas.DataFrame.fillna,../reference/api/pandas.DataFrame.fillna -generated/pandas.DataFrame.filter,../reference/api/pandas.DataFrame.filter generated/pandas.DataFrame.first,../reference/api/pandas.DataFrame.first generated/pandas.DataFrame.first_valid_index,../reference/api/pandas.DataFrame.first_valid_index generated/pandas.DataFrame.floordiv,../reference/api/pandas.DataFrame.floordiv @@ -1023,7 +1022,6 @@ generated/pandas.Series.expanding,../reference/api/pandas.Series.expanding generated/pandas.Series.factorize,../reference/api/pandas.Series.factorize generated/pandas.Series.ffill,../reference/api/pandas.Series.ffill generated/pandas.Series.fillna,../reference/api/pandas.Series.fillna -generated/pandas.Series.filter,../reference/api/pandas.Series.filter generated/pandas.Series.first,../reference/api/pandas.Series.first generated/pandas.Series.first_valid_index,../reference/api/pandas.Series.first_valid_index generated/pandas.Series.floordiv,../reference/api/pandas.Series.floordiv diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index e701d48a89db7..cee977e3bbdd3 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -184,7 +184,6 @@ Reindexing / selection / label manipulation DataFrame.drop_duplicates DataFrame.duplicated DataFrame.equals - DataFrame.filter DataFrame.idxmax DataFrame.idxmin DataFrame.reindex @@ -193,6 +192,7 @@ Reindexing / selection / label manipulation DataFrame.rename_axis DataFrame.reset_index DataFrame.sample + DataFrame.select DataFrame.set_axis DataFrame.set_index DataFrame.take diff --git 
a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 6006acc8f5e16..c919deb97f705 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -201,7 +201,7 @@ Reindexing / selection / label manipulation Series.mask Series.add_prefix Series.add_suffix - Series.filter + Series.select Missing data handling --------------------- diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index e85eead4e0f09..8053b0263c827 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -165,7 +165,7 @@ Mutating with User Defined Function (UDF) methods This section applies to pandas methods that take a UDF. In particular, the methods :meth:`DataFrame.apply`, :meth:`DataFrame.aggregate`, :meth:`DataFrame.transform`, and -:meth:`DataFrame.filter`. +:meth:`DataFrame.select`. It is a general rule in programming that one should not mutate a container while it is being iterated over. Mutation will invalidate the iterator, diff --git a/doc/source/user_guide/user_defined_functions.rst b/doc/source/user_guide/user_defined_functions.rst index 6f7fdaddac622..8aa136690ff09 100644 --- a/doc/source/user_guide/user_defined_functions.rst +++ b/doc/source/user_guide/user_defined_functions.rst @@ -84,8 +84,6 @@ User-Defined Functions can be applied across various pandas methods: +-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | :ref:`udf.pipe` | Series or DataFrame | Series or DataFrame | Chain functions together to apply to Series or Dataframe | +-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ -| :ref:`udf.filter` | Series or DataFrame | 
Boolean | Only accepts UDFs in group by. Function is called for each group, and the group is removed from the result if the function returns ``False`` | -+-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | :ref:`udf.agg` | Series or DataFrame | Scalar or Series | Aggregate and summarizes values, e.g., sum or custom reducer | +-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+ | :ref:`udf.transform` (axis=0) | Column (Series) | Column (Series) | Same as :meth:`apply` with (axis=0), but it raises an exception if the function changes the shape of the data | @@ -261,16 +259,6 @@ calling multiple functions. When to use: Use :meth:`pipe` when you need to create a pipeline of operations and want to keep the code readable and maintainable. -.. _udf.filter: - -:meth:`Series.filter` and :meth:`DataFrame.filter` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``filter`` method is used to select a subset of rows that match certain criteria. -:meth:`Series.filter` and :meth:`DataFrame.filter` do not support user defined functions, -but :meth:`SeriesGroupBy.filter` and :meth:`DataFrameGroupBy.filter` do. You can read more -about ``filter`` in groupby operations in :ref:`groupby.filter`. - .. _udf.agg: :meth:`Series.agg` and :meth:`DataFrame.agg` diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0045fc7b9c221..d05a9b264bcd6 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -715,6 +715,7 @@ Other Deprecations - Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead. 
(:issue:`61260`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.unstack` and :meth:`DataFrame.unstack` (:issue:`12189`, :issue:`53868`) +- Deprecated :meth:`Series.filter` and :meth:`DataFrame.filter`, use the renamed :meth:`Series.select` and :meth:`DataFrame.select` instead (:issue:`26642`) - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.shift` and :meth:`DataFrame.shift` (:issue:`53802`) - Deprecated option "future.no_silent_downcasting", as it is no longer used. In a future version accessing this option will raise (:issue:`59502`) - Deprecated slicing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` using a ``datetime.date`` object, explicitly cast to :class:`Timestamp` instead (:issue:`35830`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6d703c398f055..cd5c15605ee66 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5520,7 +5520,8 @@ def _reindex_with_indexers( self ) - def filter( + @final + def select( self, items=None, like: str | None = None, @@ -5530,9 +5531,9 @@ def filter( """ Subset the DataFrame or Series according to the specified index labels. - For DataFrame, filter rows or columns depending on ``axis`` argument. - Note that this routine does not filter based on content. - The filter is applied to the labels of the index. + For DataFrame, select rows or columns depending on ``axis`` argument. + Note that this routine does not select based on content. + The selection is applied to the labels of the index. Parameters ---------- @@ -5551,7 +5552,7 @@ def filter( Returns ------- Same type as caller - The filtered subset of the DataFrame or Series. + The selected subset of the DataFrame or Series. 
See Also -------- @@ -5579,22 +5580,54 @@ def filter( rabbit 4 5 6 >>> # select columns by name - >>> df.filter(items=["one", "three"]) + >>> df.select(items=["one", "three"]) one three mouse 1 3 rabbit 4 6 >>> # select columns by regular expression - >>> df.filter(regex="e$", axis=1) + >>> df.select(regex="e$", axis=1) one three mouse 1 3 rabbit 4 6 >>> # select rows containing 'bbi' - >>> df.filter(like="bbi", axis=0) + >>> df.select(like="bbi", axis=0) one two three rabbit 4 5 6 """ + + return self._filter(items=items, like=like, regex=regex, axis=axis) + + @final + def filter( + self, + items=None, + like: str | None = None, + regex: str | None = None, + axis: Axis | None = None, + ) -> Self: + """ + Use obj.select instead. + + .. deprecated:: 3.0.0 + """ + warnings.warn( + f"{type(self).__name__}.filter is deprecated and will be removed " + "in a future version. Use obj.select instead.", + Pandas4Warning, + stacklevel=find_stack_level(), + ) + return self._filter(items=items, like=like, regex=regex, axis=axis) + + @final + def _filter( + self, + items=None, + like: str | None = None, + regex: str | None = None, + axis: Axis | None = None, + ) -> Self: nkw = common.count_not_none(items, like, regex) if nkw > 1: raise TypeError( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d279594617235..f77b1b9b851d3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -756,7 +756,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): See Also -------- - Series.filter: Filter elements of ungrouped Series. + Series.select : Select elements of ungrouped Series. DataFrameGroupBy.filter : Filter elements from groups base on criterion. Notes @@ -2380,7 +2380,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame: See Also -------- - DataFrame.filter: Filter elements of ungrouped DataFrame. + DataFrame.select : Select elements of ungrouped DataFrame. 
SeriesGroupBy.filter : Filter elements from groups base on criterion. Notes diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 8abecd13c7038..d87b0cba70cc0 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -365,14 +365,14 @@ def test_select_dtypes(): @pytest.mark.parametrize( - "filter_kwargs", [{"items": ["a"]}, {"like": "a"}, {"regex": "a"}] + "select_kwargs", [{"items": ["a"]}, {"like": "a"}, {"regex": "a"}] ) -def test_filter(filter_kwargs): - # Case: selecting columns using `filter()` returns a new dataframe +def test_select(select_kwargs): + # Case: selecting columns using `select()` returns a new dataframe # + afterwards modifying the result df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() - df2 = df.filter(**filter_kwargs) + df2 = df.select(**select_kwargs) assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index dc84e2adf1239..6f8be5f97724e 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -1,40 +1,42 @@ import numpy as np import pytest +from pandas.errors import Pandas4Warning + import pandas as pd from pandas import DataFrame import pandas._testing as tm -class TestDataFrameFilter: - def test_filter(self, float_frame, float_string_frame): +class TestDataFrameSelect: + def test_select(self, float_frame, float_string_frame): # Items - filtered = float_frame.filter(["A", "B", "E"]) - assert len(filtered.columns) == 2 - assert "E" not in filtered + selected = float_frame.select(["A", "B", "E"]) + assert len(selected.columns) == 2 + assert "E" not in selected - filtered = float_frame.filter(["A", "B", "E"], axis="columns") - assert len(filtered.columns) == 2 - assert "E" not in filtered + 
selected = float_frame.select(["A", "B", "E"], axis="columns") + assert len(selected.columns) == 2 + assert "E" not in selected # Other axis idx = float_frame.index[0:4] - filtered = float_frame.filter(idx, axis="index") + selected = float_frame.select(idx, axis="index") expected = float_frame.reindex(index=idx) - tm.assert_frame_equal(filtered, expected) + tm.assert_frame_equal(selected, expected) # like fcopy = float_frame.copy() fcopy["AA"] = 1 - filtered = fcopy.filter(like="A") - assert len(filtered.columns) == 2 - assert "AA" in filtered + selected = fcopy.select(like="A") + assert len(selected.columns) == 2 + assert "AA" in selected # like with ints in column names df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"]) - filtered = df.filter(like="_") - assert len(filtered.columns) == 2 + selected = df.select(like="_") + assert len(selected.columns) == 2 # regex with ints in column names # from PR #10384 @@ -42,58 +44,58 @@ def test_filter(self, float_frame, float_string_frame): expected = DataFrame( 0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object) ) - filtered = df.filter(regex="^[0-9]+$") - tm.assert_frame_equal(filtered, expected) + selected = df.select(regex="^[0-9]+$") + tm.assert_frame_equal(selected, expected) expected = DataFrame(0.0, index=[0, 1, 2], columns=[0, "0", 1, "1"]) # shouldn't remove anything - filtered = expected.filter(regex="^[0-9]+$") - tm.assert_frame_equal(filtered, expected) + selected = expected.select(regex="^[0-9]+$") + tm.assert_frame_equal(selected, expected) # pass in None with pytest.raises(TypeError, match="Must pass"): - float_frame.filter() + float_frame.select() with pytest.raises(TypeError, match="Must pass"): - float_frame.filter(items=None) + float_frame.select(items=None) with pytest.raises(TypeError, match="Must pass"): - float_frame.filter(axis=1) + float_frame.select(axis=1) # test mutually exclusive arguments with pytest.raises(TypeError, match="mutually exclusive"): - 
float_frame.filter(items=["one", "three"], regex="e$", like="bbi") + float_frame.select(items=["one", "three"], regex="e$", like="bbi") with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], regex="e$", axis=1) + float_frame.select(items=["one", "three"], regex="e$", axis=1) with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], regex="e$") + float_frame.select(items=["one", "three"], regex="e$") with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], like="bbi", axis=0) + float_frame.select(items=["one", "three"], like="bbi", axis=0) with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], like="bbi") + float_frame.select(items=["one", "three"], like="bbi") # objects - filtered = float_string_frame.filter(like="foo") - assert "foo" in filtered + selected = float_string_frame.select(like="foo") + assert "foo" in selected # unicode columns, won't ascii-encode df = float_frame.rename(columns={"B": "\u2202"}) - filtered = df.filter(like="C") - assert "C" in filtered + selected = df.select(like="C") + assert "C" in selected - def test_filter_regex_search(self, float_frame): + def test_select_regex_search(self, float_frame): fcopy = float_frame.copy() fcopy["AA"] = 1 # regex - filtered = fcopy.filter(regex="[A]+") - assert len(filtered.columns) == 2 - assert "AA" in filtered + selected = fcopy.select(regex="[A]+") + assert len(selected.columns) == 2 + assert "AA" in selected # doesn't have to be at beginning df = DataFrame( {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]} ) - result = df.filter(regex="BB") + result = df.select(regex="BB") exp = df[[x for x in df.columns if "BB" in x]] tm.assert_frame_equal(result, exp) @@ -104,50 +106,62 @@ def test_filter_regex_search(self, float_frame): ("あ", {"あ": [3, 4]}), ], ) - def test_filter_unicode(self, name, 
expected_data): + def test_select_unicode(self, name, expected_data): # GH13101 df = DataFrame({"a": [1, 2], "あ": [3, 4]}) expected = DataFrame(expected_data) - tm.assert_frame_equal(df.filter(like=name), expected) - tm.assert_frame_equal(df.filter(regex=name), expected) + tm.assert_frame_equal(df.select(like=name), expected) + tm.assert_frame_equal(df.select(regex=name), expected) - def test_filter_bytestring(self): + def test_select_bytestring(self): # GH13101 name = "a" df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) expected = DataFrame({b"a": [1, 2]}) - tm.assert_frame_equal(df.filter(like=name), expected) - tm.assert_frame_equal(df.filter(regex=name), expected) + tm.assert_frame_equal(df.select(like=name), expected) + tm.assert_frame_equal(df.select(regex=name), expected) - def test_filter_corner(self): + def test_select_corner(self): empty = DataFrame() - result = empty.filter([]) + result = empty.select([]) tm.assert_frame_equal(result, empty) - result = empty.filter(like="foo") + result = empty.select(like="foo") tm.assert_frame_equal(result, empty) - def test_filter_regex_non_string(self): - # GH#5798 trying to filter on non-string columns should drop, + def test_select_regex_non_string(self): + # GH#5798 trying to select on non-string columns should drop, # not raise df = DataFrame(np.random.default_rng(2).random((3, 2)), columns=["STRING", 123]) - result = df.filter(regex="STRING") + result = df.select(regex="STRING") expected = df[["STRING"]] tm.assert_frame_equal(result, expected) - def test_filter_keep_order(self): + def test_select_keep_order(self): # GH#54980 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - result = df.filter(items=["B", "A"]) + result = df.select(items=["B", "A"]) expected = df[["B", "A"]] tm.assert_frame_equal(result, expected) - def test_filter_different_dtype(self): + def test_select_different_dtype(self): # GH#54980 df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]}) - result = df.filter(items=["B", "A"]) + result = 
df.select(items=["B", "A"]) expected = df[[]] tm.assert_frame_equal(result, expected) + + def test_filter_deprecated(self): + # GH#26642 + df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]}) + msg = "DataFrame.filter is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + df.filter(items=["B", "A"]) + + ser = df[1] + msg = "Series.filter is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + ser.filter([0, 1])