Skip to content

Commit d82f9dd

Browse files
authored
ENH: Improve performance for arrow dtypes in monotonic join (#51365)
1 parent d2c05c4 commit d82f9dd

File tree

3 files changed

+20
-3
lines changed

3 files changed

+20
-3
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1108,7 +1108,7 @@ Performance improvements
11081108
- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
11091109
- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`)
11101110
- Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
1111-
- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`)
1111+
- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked and arrow dtypes when :class:`Index` is monotonic (:issue:`50310`, :issue:`51365`)
11121112
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
11131113
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
11141114
- Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`)

pandas/core/indexes/base.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@
144144
validate_putmask,
145145
)
146146
from pandas.core.arrays import (
147+
ArrowExtensionArray,
147148
BaseMaskedArray,
148149
Categorical,
149150
ExtensionArray,
@@ -4850,8 +4851,10 @@ def _can_use_libjoin(self) -> bool:
48504851
if type(self) is Index:
48514852
# excludes most EAs, but includes masked and arrow arrays; we get here
48524853
# with monotonic values only, meaning no NA
4853-
return isinstance(self.dtype, np.dtype) or isinstance(
4854-
self.values, BaseMaskedArray
4854+
return (
4855+
isinstance(self.dtype, np.dtype)
4856+
or isinstance(self.values, BaseMaskedArray)
4857+
or isinstance(self._values, ArrowExtensionArray)
48554858
)
48564859
return not is_interval_dtype(self.dtype)
48574860

@@ -4942,6 +4945,10 @@ def _get_join_target(self) -> ArrayLike:
49424945
if isinstance(self._values, BaseMaskedArray):
49434946
# This is only used if our array is monotonic, so no NAs present
49444947
return self._values._data
4948+
elif isinstance(self._values, ArrowExtensionArray):
4949+
# This is only used if our array is monotonic, so no missing values
4950+
# present
4951+
return self._values.to_numpy()
49454952
return self._get_engine_target()
49464953

49474954
def _from_join_target(self, result: np.ndarray) -> ArrayLike:
@@ -4951,6 +4958,8 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike:
49514958
"""
49524959
if isinstance(self.values, BaseMaskedArray):
49534960
return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
4961+
elif isinstance(self.values, ArrowExtensionArray):
4962+
return type(self.values)._from_sequence(result)
49544963
return result
49554964

49564965
@doc(IndexOpsMixin._memory_usage)

pandas/tests/indexes/test_setops.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -884,3 +884,11 @@ def test_symmetric_difference_non_index(self, sort):
884884
result = index1.symmetric_difference(index2, result_name="new_name", sort=sort)
885885
assert tm.equalContents(result, expected)
886886
assert result.name == "new_name"
887+
888+
def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype):
889+
# GH#51365
890+
idx = Index([1, 2, 3], dtype=any_numeric_ea_and_arrow_dtype)
891+
idx2 = Index([3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype)
892+
result = idx.union(idx2)
893+
expected = Index([1, 2, 3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype)
894+
tm.assert_index_equal(result, expected)

0 commit comments

Comments
 (0)