diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index e3b75afcf945e..81172e037f388 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -563,6 +563,39 @@ without using a temporary variable. (bb.groupby(['year', 'team']).sum() .loc[lambda df: df.r > 100]) + +.. _indexing.selecting_with_regex: + +Selection by regular expression +------------------------------- + +.. versionadded:: 1.0 + +It is possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by +row/column axis labels that match a regular expression pattern. + +.. ipython:: python + + df_re = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) + df_re + + df_re.loc(regex=True)["B", "B"] + df_re.loc(axis=1, regex=True)["B"] + +The regex matching will only work when looking up single strings, not lists of strings etc. + +.. ipython:: python + + df_re.loc(regex=True)[["A"], "A"] + +*Notice*: It is currently not possible to set values for a given regular expression. + +.. ipython:: python + :okexcept: + + df_re.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] + + .. _indexing.deprecate_ix: IX indexer is deprecated diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index fa7b945492d5d..de0c396721b6b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -21,6 +21,36 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_1000.enhancements.loc_regex: + +Selection using regular expressions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by +row/column axis labels that match a regular expression pattern. + +.. 
ipython:: python + + df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) + df + + df.loc(regex=True)["B", "B"] + df.loc(axis=1, regex=True)["B"] + +The regex matching will only work when looking up single strings, not lists of strings etc. + +.. ipython:: python + + df.loc(regex=True)[["A"], "A"] + +It is currently not possible to set values for a given regular expression. + +.. ipython:: python + :okexcept: + + df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] + + .. _whatsnew_1000.enhancements.other: - diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1d4ea54ef0d70..6ba07831156c5 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,3 +1,4 @@ +import re import textwrap from typing import Tuple import warnings @@ -25,6 +26,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna +from pandas._typing import FrameOrSeries import pandas.core.common as com from pandas.core.index import Index, InvalidIndexError, MultiIndex from pandas.core.indexers import is_list_like_indexer, length_of_indexer @@ -1182,7 +1184,7 @@ def _validate_read_indexer( ) if not (ax.is_categorical() or ax.is_interval()): - warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6) + warnings.warn(_missing_key_warning, FutureWarning, stacklevel=7) def _convert_to_indexer( self, obj, axis: int, is_setter: bool = False, raise_missing: bool = False @@ -1467,6 +1469,12 @@ class _LocIndexer(_LocationIndexer): - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) + ``.loc`` can be called before selecting using these parameters: + + - ``axis``, to select by a single axis on a DataFrame, e.g. ``.loc(axis=1)['a']``. + - ``regex``, to let single strings be interpreted as regex patterns, e.g. 
+ ``.loc(regex=True)[:, '^col_']`` + See more at :ref:`Selection by Label ` Raises @@ -1546,6 +1554,21 @@ class _LocIndexer(_LocationIndexer): max_speed shield sidewinder 7 8 + The axis may be preselected + + >>> df.loc(axis=1)["max_speed"] + cobra 1 + viper 4 + sidewinder 7 + Name: max_speed, dtype: int64 + + Single strings are considered regex patterns if ``regex=True`` + + >>> df.loc(regex=True)["r$", "d$"] + max_speed shield + viper 4 5 + sidewinder 7 8 + **Setting values** Set value for all items matching the list of labels @@ -1689,6 +1712,19 @@ class _LocIndexer(_LocationIndexer): ) _exception = KeyError + regex = False + + def __call__(self, axis=None, regex=False): + new_self = super().__call__(axis=axis) + if regex: + new_self.regex = regex + return new_self + + def __getitem__(self, key): + if self.regex: + return self._getitem_regex(key) + return super().__getitem__(key) + @Appender(_NDFrameIndexer._validate_key.__doc__) def _validate_key(self, key, axis: int): @@ -1755,6 +1791,36 @@ def _get_partial_string_timestamp_match_key(self, key, labels): return key + def _getitem_regex(self, key, axis=None): + """Subset obj by regex-searching axis for key.""" + if isinstance(key, str): + if axis is None: + axis = self.axis or 0 + return self._get_regex_axis(self.obj, pattern=key, axis=axis) + elif isinstance(key, tuple): + assert len(key) == 2 + result = self.obj # type: ABCDataFrame + # slicing columns first, then index is typically faster + for ax, sub_key in zip([1, 0], reversed(key)): + if isinstance(sub_key, str): + result = self._get_regex_axis(result, pattern=sub_key, axis=ax) + else: + result = result.loc(axis=ax)[sub_key] + return result + + def _get_regex_axis( + self, obj: FrameOrSeries, pattern: str, axis: int + ) -> FrameOrSeries: + """Subset a single axis of ``obj`` from a regex pattern.""" + labels = obj._get_axis(axis) + matcher = re.compile(pattern) + + def func(x): + return matcher.search(x) is not None + + mapped = labels.map(func) + return 
obj.loc(axis=axis)[mapped] + def _getitem_axis(self, key, axis: int): key = item_from_zerodim(key) if is_iterator(key): @@ -1816,10 +1882,18 @@ def _getitem_axis(self, key, axis: int): indexer[axis] = locs return self.obj.iloc[tuple(indexer)] + elif self.regex and isinstance(key, (str, bytes)): + return self._getitem_regex(key, axis=axis) + # fall thru to straight lookup self._validate_key(key, axis) return self._get_label(key, axis=axis) + def __setitem__(self, key, value): + if self.regex: + raise NotImplementedError("Inserting with regex has not been implemented") + return super().__setitem__(key, value) + class _iLocIndexer(_LocationIndexer): """ diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abe0cd86c90d7..2f8cb6eb1401a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -13,6 +13,67 @@ from pandas.util import testing as tm +class TestLocRegex: + # test calls to df.loc, with a regex parameter set to True, + # ie. df.loc(..., regex=True)[...] 
+ + idx = ["AB", "BC", "CD", "DE"] + cols = idx[::-1] + + def test_regex_frame(self): + idx, cols = self.idx, self.cols + + df = pd.DataFrame(1, index=idx, columns=cols) + ser = df["AB"] + + result = df.loc(regex=True)["B"] + expected = pd.DataFrame(1, index=["AB", "BC"], columns=cols) + tm.assert_frame_equal(result, expected) + + result = ser.loc(regex=True)["B"] + expected = pd.Series(1, index=["AB", "BC"], name="AB") + tm.assert_series_equal(result, expected) + + result = df.loc(regex=True)[:, "B"] + expected = pd.DataFrame(1, index=idx, columns=["BC", "AB"]) + tm.assert_frame_equal(result, expected) + + result = df.loc(regex=True)["B", "B"] + expected = pd.DataFrame(1, index=["AB", "BC"], columns=["BC", "AB"]) + tm.assert_frame_equal(result, expected) + + def test_regex_empty(self): + idx, cols = self.idx, self.cols + + df = pd.DataFrame(1, index=idx, columns=cols) + ser = df["AB"] + + result = df.loc(regex=True)["X"] + expected = pd.DataFrame(columns=cols, dtype="int64") + tm.assert_frame_equal(result, expected) + + result = ser.loc(regex=True)["X"] + expected = pd.Series(name="AB", dtype="int64") + tm.assert_series_equal(result, expected) + + result = df.loc(regex=True)[:, "X"] + expected = pd.DataFrame(index=idx, dtype="int64") + tm.assert_frame_equal(result, expected) + + result = df.loc(regex=True)["X", "X"] + expected = pd.DataFrame(dtype="int64") + tm.assert_frame_equal(result, expected) + + def test_regex_inserting(self): + idx, cols = self.idx, self.cols + + df = pd.DataFrame(1, index=idx, columns=cols) + + msg = "Inserting with regex has not been implemented" + with pytest.raises(NotImplementedError, match=msg): + df.loc(regex=True)["B", "B"] = [[2, 2], [2, 2]] + + class TestLoc(Base): def test_loc_getitem_dups(self): # GH 5678