From c2b768b490a0cd706711ab2f6ffbaa171f505594 Mon Sep 17 00:00:00 2001 From: Pedro Santos Date: Tue, 25 Mar 2025 19:57:22 +0000 Subject: [PATCH 1/6] Fix #61072: inconsistent fullmatch results with regex alternation in PyArrow strings Fixes an issue where regex patterns with alternation (|) produce different results between str dtype and string[pyarrow] dtype. When using patterns like "(as)|(as)", PyArrow implementation would incorrectly match "asdf" while Python's implementation correctly rejects it. The fix adds special handling to ensure alternation patterns are properly parenthesized when using PyArrow-backed strings --- pandas/core/strings/accessor.py | 59 +++++++++++++++++-- .../strings/test_pyarrow_format_behavior.py | 31 ++++++++++ 2 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/strings/test_pyarrow_format_behavior.py diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 81f7441846589..36e8ae994f681 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1461,16 +1461,67 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): match : Similar, but also returns `True` when only a *prefix* of the string matches the regular expression. extract : Extract matched groups. - + + Notes + ----- + This method enforces consistent behavior between Python's string dtype + and PyArrow-backed string arrays when using regular expressions + containing alternation (|). For regex patterns with alternation operators, + the method ensures proper grouping by wrapping the pattern in parentheses + when using PyArrow-backed string arrays. 
Examples -------- >>> ser = pd.Series(["cat", "duck", "dove"]) >>> ser.str.fullmatch(r"d.+") - 0 False - 1 True - 2 True + 0 False + 1 True + 2 True + dtype: bool + Ensure consistent behavior with alternation patterns: + >>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]') + >>> ser.str.fullmatch(r"(as)|(as)") + 0 False + 1 True dtype: bool """ + is_pyarrow = False + arr = self._data.array + arr_type = type(arr).__name__ + is_pyarrow = arr_type == "ArrowStringArray" + if not is_pyarrow: + is_pyarrow = "Arrow" in arr_type + if not is_pyarrow and hasattr(arr, "dtype"): + dtype_str = str(arr.dtype) + is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower() + if is_pyarrow and "|" in pat: + def _is_fully_wrapped(pattern): + if not (pattern.startswith('(') and pattern.endswith(')')): + return False + inner = pattern[1:-1] + level = 0 + escape = False + in_char_class = False + for char in inner: + if escape: + escape = False + continue + if char == '\\': + escape = True + elif not in_char_class and char == '[': + in_char_class = True + elif in_char_class and char == ']': + in_char_class = False + elif not in_char_class: + if char == '(': + level += 1 + elif char == ')': + if level == 0: + return False + level -= 1 + return level == 0 + if not (pat.startswith('(') and pat.endswith(')') and + _is_fully_wrapped(pat)): + pat = f"({pat})" result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) diff --git a/pandas/tests/strings/test_pyarrow_format_behavior.py b/pandas/tests/strings/test_pyarrow_format_behavior.py new file mode 100644 index 0000000000000..8399326172aa4 --- /dev/null +++ b/pandas/tests/strings/test_pyarrow_format_behavior.py @@ -0,0 +1,31 @@ +import pytest +from pandas import ( + Series, +) +@pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) +def test_string_array(dtype): + test_series = Series(['asdf', 'as'], dtype=dtype) + regex = 
r'((as)|(as))' + regex2 = r'(as)|(as)' + assert list(test_series.str.fullmatch(regex)) == [False, True] + assert list(test_series.str.fullmatch(regex2)) == [False, True] +@pytest.mark.parametrize( + "data, pattern, expected", + [ + (["cat", "duck", "dove"], r"d.+", [False, True, True]), + ], +) +def test_string_match(data, pattern, expected): + ser = Series(data) + assert list(ser.str.fullmatch(pattern)) == expected +@pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) +@pytest.mark.parametrize( + "pattern, expected", + [ + (r'(foo)|((as)(df)?)', [True, True, True]), + ('foo|as', [False, True, True]), + ], +) +def test_string_alternation_patterns(dtype, pattern, expected): + ser = Series(['asdf', 'foo', 'as'], dtype=dtype) + assert list(ser.str.fullmatch(pattern)) == expected \ No newline at end of file From 9b917a297cb67beec34523fb173c6a51efbea253 Mon Sep 17 00:00:00 2001 From: Pedro Santos Date: Tue, 25 Mar 2025 19:57:22 +0000 Subject: [PATCH 2/6] Fix #61072: Fix regex grouping for PyArrow fullmatch tests --- pandas/core/strings/accessor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 36e8ae994f681..45cac8ec4077d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1461,7 +1461,7 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): match : Similar, but also returns `True` when only a *prefix* of the string matches the regular expression. extract : Extract matched groups. - + Notes ----- This method enforces consistent behavior between Python's string dtype @@ -1469,6 +1469,7 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): containing alternation (|). For regex patterns with alternation operators, the method ensures proper grouping by wrapping the pattern in parentheses when using PyArrow-backed string arrays. 
+ Examples -------- >>> ser = pd.Series(["cat", "duck", "dove"]) @@ -1477,8 +1478,9 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): 1 True 2 True dtype: bool + Ensure consistent behavior with alternation patterns: - >>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]') + >>> ser = pd.Series(["asdf", "as"], dtype="string[pyarrow]") >>> ser.str.fullmatch(r"(as)|(as)") 0 False 1 True From e666916b2b31f362955c3dbe7911a93ccef21e1f Mon Sep 17 00:00:00 2001 From: Pedro Santos Date: Tue, 29 Apr 2025 22:58:05 +0100 Subject: [PATCH 3/6] Update boolean dtype representation and improve regex handling in StringMethods --- pandas/core/strings/accessor.py | 31 +++++++++++-------- .../strings/test_pyarrow_format_behavior.py | 25 +++++++++------ 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 45cac8ec4077d..a50df3d59977e 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1477,14 +1477,14 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): 0 False 1 True 2 True - dtype: bool + dtype: boolean Ensure consistent behavior with alternation patterns: >>> ser = pd.Series(["asdf", "as"], dtype="string[pyarrow]") >>> ser.str.fullmatch(r"(as)|(as)") 0 False 1 True - dtype: bool + dtype: boolean """ is_pyarrow = False arr = self._data.array @@ -1494,11 +1494,14 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): is_pyarrow = "Arrow" in arr_type if not is_pyarrow and hasattr(arr, "dtype"): dtype_str = str(arr.dtype) - is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower() + is_pyarrow = ( + "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower() + ) if is_pyarrow and "|" in pat: + def _is_fully_wrapped(pattern): - if not (pattern.startswith('(') and pattern.endswith(')')): - return False + if not (pattern.startswith("(") and 
pattern.endswith(")")): + return False inner = pattern[1:-1] level = 0 escape = False @@ -1506,23 +1509,25 @@ def _is_fully_wrapped(pattern): for char in inner: if escape: escape = False - continue - if char == '\\': + continue + if char == "\\": escape = True - elif not in_char_class and char == '[': + elif not in_char_class and char == "[": in_char_class = True - elif in_char_class and char == ']': + elif in_char_class and char == "]": in_char_class = False elif not in_char_class: - if char == '(': + if char == "(": level += 1 - elif char == ')': + elif char == ")": if level == 0: return False level -= 1 return level == 0 - if not (pat.startswith('(') and pat.endswith(')') and - _is_fully_wrapped(pat)): + + if not ( + pat.startswith("(") and pat.endswith(")") and _is_fully_wrapped(pat) + ): pat = f"({pat})" result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) diff --git a/pandas/tests/strings/test_pyarrow_format_behavior.py b/pandas/tests/strings/test_pyarrow_format_behavior.py index 8399326172aa4..b372c654b9e89 100644 --- a/pandas/tests/strings/test_pyarrow_format_behavior.py +++ b/pandas/tests/strings/test_pyarrow_format_behavior.py @@ -1,14 +1,17 @@ import pytest -from pandas import ( - Series, -) + +from pandas import Series + + @pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) def test_string_array(dtype): - test_series = Series(['asdf', 'as'], dtype=dtype) - regex = r'((as)|(as))' - regex2 = r'(as)|(as)' + test_series = Series(["asdf", "as"], dtype=dtype) + regex = r"((as)|(as))" + regex2 = r"(as)|(as)" assert list(test_series.str.fullmatch(regex)) == [False, True] assert list(test_series.str.fullmatch(regex2)) == [False, True] + + @pytest.mark.parametrize( "data, pattern, expected", [ @@ -18,14 +21,16 @@ def test_string_array(dtype): def test_string_match(data, pattern, expected): ser = Series(data) assert list(ser.str.fullmatch(pattern)) == expected + + 
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) @pytest.mark.parametrize( "pattern, expected", [ - (r'(foo)|((as)(df)?)', [True, True, True]), - ('foo|as', [False, True, True]), + (r"(foo)|((as)(df)?)", [True, True, True]), + ("foo|as", [False, True, True]), ], ) def test_string_alternation_patterns(dtype, pattern, expected): - ser = Series(['asdf', 'foo', 'as'], dtype=dtype) - assert list(ser.str.fullmatch(pattern)) == expected \ No newline at end of file + ser = Series(["asdf", "foo", "as"], dtype=dtype) + assert list(ser.str.fullmatch(pattern)) == expected From f5d3b9b048fb97acaada6a5de3d1b48f4acbe011 Mon Sep 17 00:00:00 2001 From: Pedro Santos Date: Wed, 30 Apr 2025 10:32:53 +0100 Subject: [PATCH 4/6] Changed return type from boolean to bool --- pandas/core/strings/accessor.py | 4 ++-- pandas/tests/strings/test_pyarrow_format_behavior.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a50df3d59977e..c5e8d0231bb97 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1477,14 +1477,14 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): 0 False 1 True 2 True - dtype: boolean + dtype: bool Ensure consistent behavior with alternation patterns: >>> ser = pd.Series(["asdf", "as"], dtype="string[pyarrow]") >>> ser.str.fullmatch(r"(as)|(as)") 0 False 1 True - dtype: boolean + dtype: bool """ is_pyarrow = False arr = self._data.array diff --git a/pandas/tests/strings/test_pyarrow_format_behavior.py b/pandas/tests/strings/test_pyarrow_format_behavior.py index b372c654b9e89..f836763746a00 100644 --- a/pandas/tests/strings/test_pyarrow_format_behavior.py +++ b/pandas/tests/strings/test_pyarrow_format_behavior.py @@ -3,7 +3,7 @@ from pandas import Series -@pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) +@pytest.mark.parametrize("dtype", [str]) def test_string_array(dtype): test_series = 
Series(["asdf", "as"], dtype=dtype) regex = r"((as)|(as))" @@ -23,7 +23,7 @@ def test_string_match(data, pattern, expected): assert list(ser.str.fullmatch(pattern)) == expected -@pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) +@pytest.mark.parametrize("dtype", [str]) @pytest.mark.parametrize( "pattern, expected", [ From 8477264b5f22b55238ab55b988012b587a387210 Mon Sep 17 00:00:00 2001 From: Pedro Santos Date: Tue, 6 May 2025 12:00:26 +0100 Subject: [PATCH 5/6] Updated errors --- pandas/core/strings/accessor.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c5e8d0231bb97..1757cd1421fee 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1477,14 +1477,6 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): 0 False 1 True 2 True - dtype: bool - - Ensure consistent behavior with alternation patterns: - >>> ser = pd.Series(["asdf", "as"], dtype="string[pyarrow]") - >>> ser.str.fullmatch(r"(as)|(as)") - 0 False - 1 True - dtype: bool """ is_pyarrow = False arr = self._data.array From 75de7653001a832397155ee354045f75d64b9f94 Mon Sep 17 00:00:00 2001 From: Pedro Santos Date: Tue, 6 May 2025 13:14:30 +0100 Subject: [PATCH 6/6] Updated errors --- pandas/core/strings/accessor.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1757cd1421fee..6d5d3c1689595 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1469,14 +1469,6 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): containing alternation (|). For regex patterns with alternation operators, the method ensures proper grouping by wrapping the pattern in parentheses when using PyArrow-backed string arrays. 
- - Examples - -------- - >>> ser = pd.Series(["cat", "duck", "dove"]) - >>> ser.str.fullmatch(r"d.+") - 0 False - 1 True - 2 True """ is_pyarrow = False arr = self._data.array