From e75f87072daea55994077d8340503ef11a52225d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 14 Aug 2022 18:52:17 -0700 Subject: [PATCH 1/3] REF: avoid internals in merge code --- pandas/core/reshape/merge.py | 67 +++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 659cb1d2f6838..0720f836d184f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -81,7 +81,6 @@ import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc -from pandas.core.internals import concatenate_managers from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -704,28 +703,50 @@ def __init__( if validate is not None: self._validate(validate) - def get_result(self) -> DataFrame: - if self.indicator: - self.left, self.right = self._indicator_pre_merge(self.left, self.right) - - join_index, left_indexer, right_indexer = self._get_join_info() + def _reindex_and_concat( + self, + join_index: Index, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> DataFrame: + """ + reindex along index and concat along columns. + """ + # Take views so we do not alter the originals + left = self.left[:] + right = self.right[:] llabels, rlabels = _items_overlap_with_suffix( self.left._info_axis, self.right._info_axis, self.suffixes ) - lindexers = {1: left_indexer} if left_indexer is not None else {} - rindexers = {1: right_indexer} if right_indexer is not None else {} + if left_indexer: + # Pinning the index here (and in the right code just below) is not + # necessary, but makes the `.take` more performant if we have e.g. + # a MultiIndex for left.index. + left.index = range(len(left)) + left = left._take(left_indexer, axis=0, convert_indices=False) + left.index = join_index - result_data = concatenate_managers( - [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], - axes=[llabels.append(rlabels), join_index], - concat_axis=0, - copy=self.copy, - ) + if right_indexer: + right.index = range(len(right)) + right = right._take(right_indexer, axis=0, convert_indices=False) + right.index = join_index - typ = self.left._constructor - result = typ(result_data).__finalize__(self, method=self._merge_type) + from pandas import concat + + result = concat([left, right], axis=1, copy=self.copy) + result.columns = llabels.append(rlabels) + return result + + def get_result(self) -> DataFrame: + if self.indicator: + self.left, self.right = self._indicator_pre_merge(self.left, self.right) + + join_index, left_indexer, right_indexer = self._get_join_info() + + result = self._reindex_and_concat(join_index, left_indexer, right_indexer) + result = result.__finalize__(self, method=self._merge_type) if self.indicator: result = self._indicator_post_merge(result) @@ -1682,19 +1703,9 @@ def get_result(self) -> DataFrame: left_join_indexer = left_indexer right_join_indexer = right_indexer - lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} - rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} - - result_data = concatenate_managers( - [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], - axes=[llabels.append(rlabels), join_index], - concat_axis=0, - copy=self.copy, + result = self._reindex_and_concat( + join_index, left_join_indexer, right_join_indexer ) - - typ = self.left._constructor - result = typ(result_data) - self._maybe_add_join_keys(result, left_indexer, right_indexer) return result From f2b219b270565ec516d041467418c8a64b5d8de0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 Aug 2022 07:46:38 -0700 Subject: [PATCH 2/3] fix condition --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0720f836d184f..6e2ecaa860e05 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -720,7 +720,7 @@ def _reindex_and_concat( self.left._info_axis, self.right._info_axis, self.suffixes ) - if left_indexer: + if left_indexer is not None: # Pinning the index here (and in the right code just below) is not # necessary, but makes the `.take` more performant if we have e.g. # a MultiIndex for left.index. @@ -728,7 +728,7 @@ def _reindex_and_concat( left = left._take(left_indexer, axis=0, convert_indices=False) left.index = join_index - if right_indexer: + if right_indexer is not None: right.index = range(len(right)) right = right._take(right_indexer, axis=0, convert_indices=False) right.index = join_index From a6d18471fd9293eaeab277f96238f59f970fa97b Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 Aug 2022 15:38:40 -0700 Subject: [PATCH 3/3] use reindex_indexer --- pandas/core/reshape/merge.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6e2ecaa860e05..26833e15f057c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -724,13 +724,29 @@ def _reindex_and_concat( # Pinning the index here (and in the right code just below) is not # necessary, but makes the `.take` more performant if we have e.g. # a MultiIndex for left.index. - left.index = range(len(left)) - left = left._take(left_indexer, axis=0, convert_indices=False) + lmgr = left._mgr.reindex_indexer( + join_index, + left_indexer, + axis=1, + copy=False, + only_slice=True, + allow_dups=True, + use_na_proxy=True, + ) + left = left._constructor(lmgr) left.index = join_index if right_indexer is not None: - right.index = range(len(right)) - right = right._take(right_indexer, axis=0, convert_indices=False) + rmgr = right._mgr.reindex_indexer( + join_index, + right_indexer, + axis=1, + copy=False, + only_slice=True, + allow_dups=True, + use_na_proxy=True, + ) + right = right._constructor(rmgr) right.index = join_index from pandas import concat