diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -1462,6 +1462,13 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
             matches the regular expression.
         extract : Extract matched groups.
 
+        Notes
+        -----
+        For PyArrow-backed strings, a pattern containing alternation (``|``) is
+        wrapped in a non-capturing group before matching, so that every branch
+        of the alternation must match the entire string, as it already does for
+        object-backed strings.
+
         Examples
         --------
         >>> ser = pd.Series(["cat", "duck", "dove"])
@@ -1471,6 +1478,19 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
         2     True
         dtype: bool
         """
+        # Detect Arrow-backed data by class rather than by sniffing type or
+        # dtype names; imported locally to keep module import order unchanged.
+        from pandas.core.arrays import ArrowExtensionArray
+
+        if (
+            isinstance(self._data.array, ArrowExtensionArray)
+            and isinstance(pat, str)
+            and "|" in pat
+        ):
+            # A non-capturing group keeps the alternation together without
+            # changing group numbering; wrapping an already-grouped pattern
+            # is harmless, so no parenthesis parsing is needed.
+            pat = f"(?:{pat})"
         result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
diff --git a/pandas/tests/strings/test_pyarrow_format_behavior.py b/pandas/tests/strings/test_pyarrow_format_behavior.py
new file mode 100644
--- /dev/null
+++ b/pandas/tests/strings/test_pyarrow_format_behavior.py
@@ -0,0 +1,46 @@
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import Series
+
+# Exercise both the object-backed and the PyArrow-backed implementations.
+STRING_DTYPES = [
+    str,
+    pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+]
+
+
+@pytest.mark.parametrize("dtype", STRING_DTYPES)
+def test_string_array(dtype):
+    # A pattern wrapped in an outer group and the bare alternation must agree.
+    test_series = Series(["asdf", "as"], dtype=dtype)
+    regex = r"((as)|(as))"
+    regex2 = r"(as)|(as)"
+    assert list(test_series.str.fullmatch(regex)) == [False, True]
+    assert list(test_series.str.fullmatch(regex2)) == [False, True]
+
+
+@pytest.mark.parametrize(
+    "data, pattern, expected",
+    [
+        (["cat", "duck", "dove"], r"d.+", [False, True, True]),
+    ],
+)
+def test_string_match(data, pattern, expected):
+    # Plain pattern without alternation (the docstring example).
+    ser = Series(data)
+    assert list(ser.str.fullmatch(pattern)) == expected
+
+
+@pytest.mark.parametrize("dtype", STRING_DTYPES)
+@pytest.mark.parametrize(
+    "pattern, expected",
+    [
+        (r"(foo)|((as)(df)?)", [True, True, True]),
+        ("foo|as", [False, True, True]),
+    ],
+)
+def test_string_alternation_patterns(dtype, pattern, expected):
+    ser = Series(["asdf", "foo", "as"], dtype=dtype)
+    assert list(ser.str.fullmatch(pattern)) == expected