diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index c987588097953..fb7d3faa21cee 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1087,6 +1087,7 @@ Performance improvements
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 - Performance improvement for indexing operations with nullable and arrow dtypes (:issue:`49420`, :issue:`51316`)
 - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
+- Performance improvement for :func:`concat` with misaligned dataframes having a single float dtype (:issue:`50652`)
 - Performance improvement for :func:`api.types.infer_dtype` (:issue:`51054`)
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
 - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 650d51b896dc5..7c5a1afc83351 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -21,6 +21,7 @@
 
 from pandas.util._decorators import cache_readonly
 
+from pandas.core.dtypes.common import is_float_dtype
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -49,6 +50,7 @@
     from pandas._typing import (
         Axis,
         AxisInt,
+        Dtype,
         HashableT,
     )
 
@@ -563,6 +565,18 @@ def __init__(
 
         self.new_axes = self._get_new_axes()
 
+    def _maybe_float_dtype(self) -> Dtype | None:
+        """If all columns in all objs are float only, we may be able to optimize."""
+        all_dtypes = [
+            blk.dtype
+            for df in self.objs
+            for blk in df._mgr.blocks  # type: ignore[union-attr]
+        ]
+        all_dtypes = [*dict.fromkeys(all_dtypes)]
+        if len(all_dtypes) != 1:
+            return None
+        return all_dtypes[0] if is_float_dtype(all_dtypes[0]) else None
+
     def get_result(self):
         cons: Callable[..., DataFrame | Series]
         sample: DataFrame | Series
@@ -597,6 +611,7 @@ def get_result(self):
         # combine block managers
         else:
             sample = cast("DataFrame", self.objs[0])
+            maybe_float = self._maybe_float_dtype()
 
             mgrs_indexers = []
             for obj in self.objs:
@@ -608,9 +623,17 @@ def get_result(self):
                         continue
 
                     # 1-ax to convert BlockManager axis to DataFrame axis
-                    obj_labels = obj.axes[1 - ax]
-                    if not new_labels.equals(obj_labels):
-                        indexers[ax] = obj_labels.get_indexer(new_labels)
+                    obj_labels = obj.axes[self.bm_axis]
+                    if new_labels.equals(obj_labels):
+                        continue
+                    if maybe_float is not None and obj_labels.is_unique:
+                        # by aligning dataframes to new_labels, we get a perf boost
+                        # only done with frames with all floats ATM
+                        obj = obj.reindex(new_labels, axis=self.bm_axis)
+                        obj = obj.astype(maybe_float)
+                        continue
+
+                    indexers[ax] = obj_labels.get_indexer(new_labels)
 
                 mgrs_indexers.append((obj._mgr, indexers))
 