Skip to content

Fix TypeError when merging categorical dates #16986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ Sparse
Reshaping
^^^^^^^^^
- Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`)
- Merging with categorical date columns raised a TypeError (:issue:`16900`)
- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`)
- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`)

Expand Down
25 changes: 18 additions & 7 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,7 +877,7 @@ def _get_merge_keys(self):
return left_keys, right_keys, join_names

def _maybe_coerce_merge_keys(self):
# we have valid mergee's but we may have to further
# we have valid mergees but we may have to further
# coerce these if they are originally incompatible types
#
# for example if these are categorical, but are not dtype_equal
Expand All @@ -889,12 +889,16 @@ def _maybe_coerce_merge_keys(self):
if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
continue

lk_is_cat = is_categorical_dtype(lk)
rk_is_cat = is_categorical_dtype(rk)

# if either left or right is a categorical
# then they must match exactly in categories & ordered
if is_categorical_dtype(lk) and is_categorical_dtype(rk):
if lk_is_cat and rk_is_cat:
if lk.is_dtype_equal(rk):
continue
elif is_categorical_dtype(lk) or is_categorical_dtype(rk):

elif lk_is_cat or rk_is_cat:
pass

elif is_dtype_equal(lk.dtype, rk.dtype):
Expand All @@ -904,7 +908,7 @@ def _maybe_coerce_merge_keys(self):
# kinds to proceed, eg. int64 and int8
# further if we are object, but we infer to
# the same, then proceed
if (is_numeric_dtype(lk) and is_numeric_dtype(rk)):
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
if lk.dtype.kind == rk.dtype.kind:
continue

Expand All @@ -913,13 +917,20 @@ def _maybe_coerce_merge_keys(self):
continue

# Houston, we have a problem!
# let's coerce to object
# let's coerce to object if the dtypes aren't
# categorical, otherwise coerce to the category
# dtype. If we coerced categories to object,
# then we would lose type information on some
# columns, and end up trying to merge
# incompatible dtypes. See GH 16900.
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(object)})
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(object)})
**{name: self.right[name].astype(typ)})

def _validate_specification(self):
# Hm, any way to make this logic less complicated??
Expand Down
36 changes: 35 additions & 1 deletion pandas/tests/reshape/test_merge.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# pylint: disable=E1103

import pytest
from datetime import datetime
from datetime import datetime, date
from numpy.random import randn
from numpy import nan
import numpy as np
Expand Down Expand Up @@ -1515,6 +1515,40 @@ def test_self_join_multiple_categories(self):

assert_frame_equal(result, df)

def test_dtype_on_categorical_dates(self):
# GH 16900
# dates should not be coerced to ints

df = pd.DataFrame(
[[date(2001, 1, 1), 1.1],
[date(2001, 1, 2), 1.3]],
columns=['date', 'num2']
)
df['date'] = df['date'].astype('category')

df2 = pd.DataFrame(
[[date(2001, 1, 1), 1.3],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs testing with an inner merge as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use parametrize instead of duplicating code here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you can do this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been updated as per your previous comment:

> construct the expected result and use tm.assert_frame_equal for both examples

Did you want it changed to use parametrize instead?

[date(2001, 1, 3), 1.4]],
columns=['date', 'num4']
)
df2['date'] = df2['date'].astype('category')

expected_outer = pd.DataFrame([
[pd.Timestamp('2001-01-01'), 1.1, 1.3],
[pd.Timestamp('2001-01-02'), 1.3, np.nan],
[pd.Timestamp('2001-01-03'), np.nan, 1.4]],
columns=['date', 'num2', 'num4']
)
result_outer = pd.merge(df, df2, how='outer', on=['date'])
assert_frame_equal(result_outer, expected_outer)

expected_inner = pd.DataFrame(
[[pd.Timestamp('2001-01-01'), 1.1, 1.3]],
columns=['date', 'num2', 'num4']
)
result_inner = pd.merge(df, df2, how='inner', on=['date'])
assert_frame_equal(result_inner, expected_inner)


@pytest.fixture
def left_df():
Expand Down