Skip to content

Commit ee69f90

Browse files
committed
PERF: faster pd.concat when all frames share a single float dtype but have misaligned axes
1 parent 5b9f980 commit ee69f90

File tree

2 files changed

+27
-3
lines changed

2 files changed

+27
-3
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,7 @@ Performance improvements
10871087
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
10881088
- Performance improvement for indexing operations with nullable and arrow dtypes (:issue:`49420`, :issue:`51316`)
10891089
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
1090+
- Performance improvement for :func:`concat` with misaligned dataframes having a single float dtype (:issue:`50652`)
10901091
- Performance improvement for :func:`api.types.infer_dtype` (:issue:`51054`)
10911092
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
10921093
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)

pandas/core/reshape/concat.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
from pandas.util._decorators import cache_readonly
2323

24+
from pandas.core.dtypes.common import is_float_dtype
2425
from pandas.core.dtypes.concat import concat_compat
2526
from pandas.core.dtypes.generic import (
2627
ABCDataFrame,
@@ -49,6 +50,7 @@
4950
from pandas._typing import (
5051
Axis,
5152
AxisInt,
53+
Dtype,
5254
HashableT,
5355
)
5456

@@ -563,6 +565,18 @@ def __init__(
563565

564566
self.new_axes = self._get_new_axes()
565567

568+
def _maybe_float_dtype(self) -> Dtype | None:
569+
"""If all columns in all objs are float only, we may be able to optimize."""
570+
all_dtypes = [
571+
blk.dtype
572+
for df in self.objs
573+
for blk in df._mgr.blocks # type: ignore[union-attr]
574+
]
575+
all_dtypes = [*dict.fromkeys(all_dtypes)]
576+
if len(all_dtypes) != 1:
577+
return None
578+
return all_dtypes[0] if is_float_dtype(all_dtypes[0]) else None
579+
566580
def get_result(self):
567581
cons: Callable[..., DataFrame | Series]
568582
sample: DataFrame | Series
@@ -597,6 +611,7 @@ def get_result(self):
597611
# combine block managers
598612
else:
599613
sample = cast("DataFrame", self.objs[0])
614+
maybe_float = self._maybe_float_dtype()
600615

601616
mgrs_indexers = []
602617
for obj in self.objs:
@@ -608,9 +623,17 @@ def get_result(self):
608623
continue
609624

610625
# 1-ax to convert BlockManager axis to DataFrame axis
611-
obj_labels = obj.axes[1 - ax]
612-
if not new_labels.equals(obj_labels):
613-
indexers[ax] = obj_labels.get_indexer(new_labels)
626+
obj_labels = obj.axes[self.bm_axis]
627+
if new_labels.equals(obj_labels):
628+
continue
629+
if maybe_float is not None and obj_labels.is_unique:
630+
# by aligning dataframes to new_labels, we get a perf boost
631+
# only done with frames with all floats ATM
632+
obj = obj.reindex(new_labels, axis=self.bm_axis)
633+
obj = obj.astype(maybe_float)
634+
continue
635+
636+
indexers[ax] = obj_labels.get_indexer(new_labels)
614637

615638
mgrs_indexers.append((obj._mgr, indexers))
616639

0 commit comments

Comments
 (0)