From e1f43142c108342b44ef117d45615bd6f7feb365 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 4 Apr 2023 22:20:37 +0200 Subject: [PATCH 01/10] BUG: merge with arrow and numpy dtypes raises --- pandas/core/reshape/merge.py | 10 ++++++++-- pandas/tests/reshape/merge/test_merge.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bfaf403491801..5a474eb003933 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -83,7 +83,7 @@ Categorical, Index, MultiIndex, - Series, + Series, ArrowDtype, ) import pandas.core.algorithms as algos from pandas.core.arrays import ( @@ -2377,7 +2377,7 @@ def _factorize_keys( rk = ensure_int64(rk.codes) elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): - if not isinstance(lk, BaseMaskedArray): + if not isinstance(lk, BaseMaskedArray) and not isinstance(lk.dtype, ArrowDtype): lk, _ = lk._values_for_factorize() # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute @@ -2392,6 +2392,10 @@ def _factorize_keys( assert isinstance(rk, BaseMaskedArray) llab = rizer.factorize(lk._data, mask=lk._mask) rlab = rizer.factorize(rk._data, mask=rk._mask) + elif isinstance(lk.dtype, ArrowDtype): + # we can only get here with numeric dtypes + llab = rizer.factorize(lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()) + rlab = rizer.factorize(rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()) else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], @@ -2450,6 +2454,8 @@ def _convert_arrays_and_get_rizer_klass( # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; # expected type "Type[object]" klass = _factorizers[lk.dtype.type] # type: ignore[index] + elif isinstance(lk.dtype, ArrowDtype): + klass = 
_factorizers[lk.dtype.numpy_dtype.type] else: klass = _factorizers[lk.dtype.type] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 6f2b327c37067..d09ce0fd88e8b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2761,3 +2761,16 @@ def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type): } ) tm.assert_frame_equal(result, expected) + + +def test_merge_arrow_and_numpy_dtypes(): + # GH#52406 + df = DataFrame({"a": [1, 2]}, dtype="int64") + df2 = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]") + result = df.merge(df2) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + result = df2.merge(df) + expected = df2.copy() + tm.assert_frame_equal(result, expected) From 93a448613f586fa459e1b503852748868b4f5818 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 4 Apr 2023 22:22:56 +0200 Subject: [PATCH 02/10] BUG: merge with arrow and numpy dtypes raises --- pandas/core/reshape/merge.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5a474eb003933..54791adfea250 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -80,10 +80,11 @@ ) from pandas import ( + ArrowDtype, Categorical, Index, MultiIndex, - Series, ArrowDtype, + Series, ) import pandas.core.algorithms as algos from pandas.core.arrays import ( @@ -2394,8 +2395,12 @@ def _factorize_keys( rlab = rizer.factorize(rk._data, mask=rk._mask) elif isinstance(lk.dtype, ArrowDtype): # we can only get here with numeric dtypes - llab = rizer.factorize(lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()) - rlab = rizer.factorize(rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()) + llab = rizer.factorize( + lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() + ) + rlab = rizer.factorize( + 
rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() + ) else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], From 4e27a4fb3a2276c0d89c993aacb075e639402a25 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 4 Apr 2023 22:24:29 +0200 Subject: [PATCH 03/10] BUG: merge with arrow and numpy dtypes raises --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 0122c84ba2a8e..4fda2cd11ce12 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,7 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` on one side and a NumPy dtype on the other side (:issue:`52406`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: From 4433bfc4c4c153e18af0ecf6218e36ddc77bfc87 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 4 Apr 2023 22:26:19 +0200 Subject: [PATCH 04/10] BUG: merge with arrow and numpy dtypes raises --- pandas/core/reshape/merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 54791adfea250..146b8e7694c81 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2395,6 +2395,7 @@ def _factorize_keys( rlab = rizer.factorize(rk._data, mask=rk._mask) elif isinstance(lk.dtype, ArrowDtype): # we can only get here with numeric dtypes + # TODO: Remove when we have a Factorizer for Arrow llab = rizer.factorize( lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() ) From ef90b455d4e2aa22466a3fc06b56a01e44193160 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 4 Apr 2023 23:00:22 +0200 
Subject: [PATCH 05/10] Fix conversion --- pandas/core/reshape/merge.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 146b8e7694c81..090c6db7568d3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2378,7 +2378,10 @@ def _factorize_keys( rk = ensure_int64(rk.codes) elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): - if not isinstance(lk, BaseMaskedArray) and not isinstance(lk.dtype, ArrowDtype): + if not isinstance(lk, BaseMaskedArray) and not ( + isinstance(lk.dtype, ArrowDtype) + and is_numeric_dtype(lk.dtype.numpy_dtype.type) + ): lk, _ = lk._values_for_factorize() # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute From d34d28ffa831999de918c0090432a2c39a476c79 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 6 Apr 2023 00:07:08 +0200 Subject: [PATCH 06/10] Fix mypy and add test --- pandas/core/reshape/merge.py | 3 ++- pandas/tests/reshape/merge/test_merge.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 090c6db7568d3..ff85e1eb35319 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -88,6 +88,7 @@ ) import pandas.core.algorithms as algos from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, ExtensionArray, ) @@ -2396,7 +2397,7 @@ def _factorize_keys( assert isinstance(rk, BaseMaskedArray) llab = rizer.factorize(lk._data, mask=lk._mask) rlab = rizer.factorize(rk._data, mask=rk._mask) - elif isinstance(lk.dtype, ArrowDtype): + elif isinstance(lk, ArrowExtensionArray) or isinstance(rk, ArrowExtensionArray): # we can only get here with numeric dtypes # TODO: Remove when we have a Factorizer for Arrow llab = rizer.factorize( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d09ce0fd88e8b..f2bf63fbaed45 100644 
--- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.compat import pa_version_under7p0 + from pandas.core.dtypes.common import ( is_categorical_dtype, is_object_dtype, @@ -2763,9 +2765,11 @@ def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type): tm.assert_frame_equal(result, expected) -def test_merge_arrow_and_numpy_dtypes(): +@pytest.mark.skipif(pa_version_under7p0, reason="need pyarrow") +@pytest.mark.parametrize("dtype", ["int64", "int64[pyarrow]"]) +def test_merge_arrow_and_numpy_dtypes(dtype): # GH#52406 - df = DataFrame({"a": [1, 2]}, dtype="int64") + df = DataFrame({"a": [1, 2]}, dtype=dtype) df2 = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]") result = df.merge(df2) expected = df.copy() From 88b4b88bb9bf7678ef5441594eb47e6f3861e6d9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 6 Apr 2023 00:34:43 +0200 Subject: [PATCH 07/10] Update pandas/core/reshape/merge.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ff85e1eb35319..975924e21967f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2381,7 +2381,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): if not isinstance(lk, BaseMaskedArray) and not ( isinstance(lk.dtype, ArrowDtype) - and is_numeric_dtype(lk.dtype.numpy_dtype.type) + and lk.dtype._is_numeric ): lk, _ = lk._values_for_factorize() From cfbc1648110cc575a1e78dc8af6afe782db35f0b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 6 Apr 2023 00:57:07 +0200 Subject: [PATCH 08/10] Fix --- pandas/core/reshape/merge.py | 3 +-- pandas/tests/reshape/merge/test_merge.py | 4 +--- 2 files changed, 2 insertions(+), 5 
deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 975924e21967f..b8f8d6b3ef160 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2380,8 +2380,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): if not isinstance(lk, BaseMaskedArray) and not ( - isinstance(lk.dtype, ArrowDtype) - and lk.dtype._is_numeric + isinstance(lk.dtype, ArrowDtype) and lk.dtype._is_numeric ): lk, _ = lk._values_for_factorize() diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f2bf63fbaed45..3a822c8134eb4 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 - from pandas.core.dtypes.common import ( is_categorical_dtype, is_object_dtype, @@ -2765,10 +2763,10 @@ def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type): tm.assert_frame_equal(result, expected) -@pytest.mark.skipif(pa_version_under7p0, reason="need pyarrow") @pytest.mark.parametrize("dtype", ["int64", "int64[pyarrow]"]) def test_merge_arrow_and_numpy_dtypes(dtype): # GH#52406 + pytest.importorskip("pyarrow") df = DataFrame({"a": [1, 2]}, dtype=dtype) df2 = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]") result = df.merge(df2) From 88201e737624d6cd10cb9260e2ccf96e1d7142d7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 6 Apr 2023 01:07:46 +0200 Subject: [PATCH 09/10] Fix --- pandas/core/reshape/merge.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b8f8d6b3ef160..4c3665ee99393 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2380,7 +2380,9 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): if not isinstance(lk, BaseMaskedArray) and 
not ( - isinstance(lk.dtype, ArrowDtype) and lk.dtype._is_numeric + # exclude arrow dtypes that would get casted to object + isinstance(lk.dtype, ArrowDtype) + and is_numeric_dtype(lk.dtype.numpy_dtype) ): lk, _ = lk._values_for_factorize() @@ -2396,7 +2398,8 @@ def _factorize_keys( assert isinstance(rk, BaseMaskedArray) llab = rizer.factorize(lk._data, mask=lk._mask) rlab = rizer.factorize(rk._data, mask=rk._mask) - elif isinstance(lk, ArrowExtensionArray) or isinstance(rk, ArrowExtensionArray): + elif isinstance(lk, ArrowExtensionArray): + assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes # TODO: Remove when we have a Factorizer for Arrow llab = rizer.factorize( From 0b8c7c3692a690ae4c4623905694ebba7662eb8b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 6 Apr 2023 01:07:51 +0200 Subject: [PATCH 10/10] Fix --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4c3665ee99393..6b8e72c503de3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2380,7 +2380,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): if not isinstance(lk, BaseMaskedArray) and not ( - # exclude arrow dtypes that would get casted to object + # exclude arrow dtypes that would get cast to object isinstance(lk.dtype, ArrowDtype) and is_numeric_dtype(lk.dtype.numpy_dtype) ):