Skip to content

Commit d29c853

Browse files
committed
PERF: faster pd.concat when same concat float dtype but misaligned axis
1 parent 58cc356 commit d29c853

File tree

2 files changed

+27
-3
lines changed

2 files changed

+27
-3
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,7 @@ Performance improvements
11181118
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
11191119
- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`)
11201120
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
1121+
- Performance improvement for :func:`concat` with misaligned dataframes having a single float dtype (:issue:`50652`)
11211122
- Performance improvement for :func:`api.types.infer_dtype` (:issue:`51054`)
11221123
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
11231124
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)

pandas/core/reshape/concat.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@
2222
from pandas._typing import (
2323
Axis,
2424
AxisInt,
25+
Dtype,
2526
HashableT,
2627
)
2728
from pandas.util._decorators import cache_readonly
2829

30+
from pandas.core.dtypes.common import is_float_dtype
2931
from pandas.core.dtypes.concat import concat_compat
3032
from pandas.core.dtypes.generic import (
3133
ABCDataFrame,
@@ -564,6 +566,18 @@ def __init__(
564566

565567
self.new_axes = self._get_new_axes()
566568

569+
def _maybe_float_dtype(self) -> Dtype | None:
570+
"""Return the single dtype shared by all blocks of all objs if it is a float dtype, else None."""
571+
all_dtypes = [
572+
blk.dtype
573+
for df in self.objs
574+
for blk in df._mgr.blocks # type: ignore[union-attr]
575+
]
576+
all_dtypes = [*dict.fromkeys(all_dtypes)]  # drop duplicate dtypes, keeping first-seen order
577+
if len(all_dtypes) != 1:
578+
return None  # zero or several distinct dtypes were found
579+
return all_dtypes[0] if is_float_dtype(all_dtypes[0]) else None
580+
567581
def get_result(self):
568582
cons: Callable[..., DataFrame | Series]
569583
sample: DataFrame | Series
@@ -598,6 +612,7 @@ def get_result(self):
598612
# combine block managers
599613
else:
600614
sample = cast("DataFrame", self.objs[0])
615+
maybe_float = self._maybe_float_dtype()
601616

602617
mgrs_indexers = []
603618
for obj in self.objs:
@@ -609,9 +624,17 @@ def get_result(self):
609624
continue
610625

611626
# 1-ax to convert BlockManager axis to DataFrame axis
612-
obj_labels = obj.axes[1 - ax]
613-
if not new_labels.equals(obj_labels):
614-
indexers[ax] = obj_labels.get_indexer(new_labels)
627+
obj_labels = obj.axes[self.bm_axis]
628+
if new_labels.equals(obj_labels):
629+
continue
630+
if maybe_float is not None and obj_labels.is_unique:
631+
# by aligning dataframes to new_labels, we get a perf boost
632+
# only done with frames with all floats ATM
633+
obj = obj.reindex(new_labels, axis=self.bm_axis)
634+
obj = obj.astype(maybe_float)
635+
continue
636+
637+
indexers[ax] = obj_labels.get_indexer(new_labels)
615638

616639
mgrs_indexers.append((obj._mgr, indexers))
617640

0 commit comments

Comments
 (0)