diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 29f360e050548..08ba7d80f6880 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1089,7 +1089,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
 - Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`)
 - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
-- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`)
+- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked and arrow dtypes when :class:`Index` is monotonic (:issue:`50310`, :issue:`51365`)
 - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
 - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
 - Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 363bfe76d40fb..9d4a4ca8a5140 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -145,6 +145,7 @@
     validate_putmask,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
     Categorical,
     ExtensionArray,
@@ -4847,8 +4848,10 @@ def _can_use_libjoin(self) -> bool:
         if type(self) is Index:
             # excludes EAs, but include masks, we get here with monotonic
             # values only, meaning no NA
-            return isinstance(self.dtype, np.dtype) or isinstance(
-                self.values, BaseMaskedArray
+            return (
+                isinstance(self.dtype, np.dtype)
+                or isinstance(self.values, BaseMaskedArray)
+                or isinstance(self._values, ArrowExtensionArray)
             )
         return not is_interval_dtype(self.dtype)
 
@@ -4939,6 +4942,10 @@ def _get_join_target(self) -> ArrayLike:
         if isinstance(self._values, BaseMaskedArray):
             # This is only used if our array is monotonic, so no NAs present
             return self._values._data
+        elif isinstance(self._values, ArrowExtensionArray):
+            # This is only used if our array is monotonic, so no missing values
+            # present
+            return self._values.to_numpy()
         return self._get_engine_target()
 
     def _from_join_target(self, result: np.ndarray) -> ArrayLike:
@@ -4948,6 +4955,10 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike:
         """
         if isinstance(self.values, BaseMaskedArray):
             return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
+        elif isinstance(self.values, ArrowExtensionArray):
+            # Pass dtype explicitly: without it the pyarrow type is re-inferred
+            # from the numpy join result and may not match self.dtype
+            return type(self.values)._from_sequence(result, dtype=self.dtype)
         return result
 
     @doc(IndexOpsMixin._memory_usage)
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index 001efe07b5d2b..708de02518b73 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -886,3 +886,11 @@ def test_symmetric_difference_non_index(self, sort):
         result = index1.symmetric_difference(index2, result_name="new_name", sort=sort)
         assert tm.equalContents(result, expected)
         assert result.name == "new_name"
+
+    def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype):
+        # GH#51365
+        idx = Index([1, 2, 3], dtype=any_numeric_ea_and_arrow_dtype)
+        idx2 = Index([3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype)
+        result = idx.union(idx2)
+        expected = Index([1, 2, 3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype)
+        tm.assert_index_equal(result, expected)