Skip to content

Commit c1247a7

Browse files
meeseeksmachine, jorisvandenbossche, mroeschke
authored
Backport PR #53231 on branch 2.0.x (PERF: fix merging on datetimelike columns to not use object-dtype factorizer) (#53471)
* Backport PR #53231: PERF: fix merging on datetimelike columns to not use object-dtype factorizer
* Update pandas/core/reshape/merge.py
* Check npdtype

---------

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
1 parent 54e7fe9 commit c1247a7

File tree

3 files changed

+48
-1
lines changed

3 files changed

+48
-1
lines changed

asv_bench/benchmarks/join_merge.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,38 @@ def time_i8merge(self, how):
317317
merge(self.left, self.right, how=how)
318318

319319

320+
class MergeDatetime:
321+
params = [
322+
[
323+
("ns", "ns"),
324+
("ms", "ms"),
325+
("ns", "ms"),
326+
],
327+
[None, "Europe/Brussels"],
328+
]
329+
param_names = ["units", "tz"]
330+
331+
def setup(self, units, tz):
332+
unit_left, unit_right = units
333+
N = 10_000
334+
keys = Series(date_range("2012-01-01", freq="T", periods=N, tz=tz))
335+
self.left = DataFrame(
336+
{
337+
"key": keys.sample(N * 10, replace=True).dt.as_unit(unit_left),
338+
"value1": np.random.randn(N * 10),
339+
}
340+
)
341+
self.right = DataFrame(
342+
{
343+
"key": keys[:8000].dt.as_unit(unit_right),
344+
"value2": np.random.randn(8000),
345+
}
346+
)
347+
348+
def time_merge(self, units, tz):
349+
merge(self.left, self.right)
350+
351+
320352
class MergeCategoricals:
321353
def setup(self):
322354
self.left_object = DataFrame(

doc/source/whatsnew/v2.0.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ including other versions of pandas.
1313

1414
Fixed regressions
1515
~~~~~~~~~~~~~~~~~
16+
- Fixed performance regression in merging on datetime-like columns (:issue:`53231`)
1617
-
1718

1819
.. ---------------------------------------------------------------------------

pandas/core/reshape/merge.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2370,7 +2370,14 @@ def _factorize_keys(
23702370
rk = extract_array(rk, extract_numpy=True, extract_range=True)
23712371
# TODO: if either is a RangeIndex, we can likely factorize more efficiently?
23722372

2373-
if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype):
2373+
if (
2374+
isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
2375+
) or (
2376+
isinstance(lk.dtype, np.dtype)
2377+
and lk.dtype.kind == "M"
2378+
and isinstance(rk.dtype, np.dtype)
2379+
and rk.dtype.kind == "M"
2380+
):
23742381
# Extract the ndarray (UTC-localized) values
23752382
# Note: we dont need the dtypes to match, as these can still be compared
23762383
lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
@@ -2403,6 +2410,13 @@ def _factorize_keys(
24032410
# "_values_for_factorize"
24042411
rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
24052412

2413+
if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
2414+
# GH#23917 TODO: Needs tests for non-matching dtypes
2415+
# GH#23917 TODO: needs tests for case where lk is integer-dtype
2416+
# and rk is datetime-dtype
2417+
lk = np.asarray(lk, dtype=np.int64)
2418+
rk = np.asarray(rk, dtype=np.int64)
2419+
24062420
klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
24072421

24082422
rizer = klass(max(len(lk), len(rk)))

0 commit comments

Comments
 (0)