Skip to content

BUG: Concatenation of TZ-aware dataframes #21014

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1372,6 +1372,8 @@ Reshaping
- Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
- Bug in :func:`get_dummies`, and :func:`select_dtypes`, where duplicate column names caused incorrect behavior (:issue:`20848`)
- Bug in :func:`isna`, which cannot handle ambiguous typed lists (:issue:`20675`)
- Bug in :func:`concat` which raises an error when concatenating TZ-aware dataframes and all-NaT dataframes (:issue:`12396`)
- Bug in :func:`concat` which raises an error when concatenating empty TZ-aware series (:issue:`18447`)

Other
^^^^^
Expand Down
99 changes: 51 additions & 48 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,13 @@ def _maybe_unwrap(x):
fastpath=True)


def _concatenate_2d(to_concat, axis):
# coerce to 2d if needed & concatenate
if axis == 1:
to_concat = [np.atleast_2d(x) for x in to_concat]
return np.concatenate(to_concat, axis=axis)


def _concat_datetime(to_concat, axis=0, typs=None):
    """
    provide concatenation of a datetimelike array of arrays each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    if typs is None:
        typs = get_dtype_kinds(to_concat)

    # multiple types, need to coerce to object
    if len(typs) != 1:
        return _concatenate_2d([_convert_datetimelike_to_object(x)
                                for x in to_concat],
                               axis=axis)

    # must be single dtype
    if any(typ.startswith('datetime') for typ in typs):

        if 'datetime' in typs:
            # tz-naive datetime64[ns]: concatenate the i8 view and
            # re-view as M8[ns]
            to_concat = [np.array(x, copy=False).view(np.int64)
                         for x in to_concat]
            return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE)
        else:
            # when to_concat has different tz, len(typs) > 1.
            # thus no need to care
            return _concat_datetimetz(to_concat)

    elif 'timedelta' in typs:
        return _concatenate_2d([x.view(np.int64) for x in to_concat],
                               axis=axis).view(_TD_DTYPE)

    elif any(typ.startswith('period') for typ in typs):
        # PeriodIndex must be handled by PeriodIndex,
        # Thus can't meet this condition ATM
        # Must be changed when we adding PeriodDtype
        raise NotImplementedError("unable to concat PeriodDtype")
def _convert_datetimelike_to_object(x):
    """Coerce a datetimelike array to an object-dtype array of scalars."""

    # datetime64 (possibly tz-aware): tz-aware goes through astype(object),
    # tz-naive is boxed into Timestamps element-wise
    if x.dtype.kind == _NS_DTYPE.kind:
        if getattr(x, 'tz', None) is not None:
            return x.astype(object).values
        original_shape = x.shape
        boxed = tslib.ints_to_pydatetime(x.view(np.int64).ravel(),
                                         box="timestamp")
        return boxed.reshape(original_shape)

    # timedelta64: box into Timedelta objects element-wise
    if x.dtype == _TD_DTYPE:
        original_shape = x.shape
        boxed = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
        return boxed.reshape(original_shape)

    # anything else passes through unchanged
    return x


def _concat_datetimetz(to_concat, name=None):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2190,10 +2190,10 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
msg = ('When allow_fill=True and fill_value is not None, '
'all indices must be >= -1')
raise ValueError(msg)
taken = values.take(indices)
mask = indices == -1
if mask.any():
taken[mask] = na_value
taken = algos.take(values,
indices,
allow_fill=allow_fill,
fill_value=na_value)
else:
taken = values.take(indices)
return taken
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5835,7 +5835,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
if len(values) and values[0] is None:
fill_value = None

if getattr(self.block, 'is_datetimetz', False):
if getattr(self.block, 'is_datetimetz', False) or \
is_datetimetz(empty_dtype):
pass
elif getattr(self.block, 'is_categorical', False):
pass
Expand Down
86 changes: 86 additions & 0 deletions pandas/tests/reshape/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1917,6 +1917,77 @@ def test_concat_tz_series_tzlocal(self):
tm.assert_series_equal(result, pd.Series(x + y))
assert result.dtype == 'datetime64[ns, tzlocal()]'

@pytest.mark.parametrize('tz1', [None, 'UTC'])
@pytest.mark.parametrize('tz2', [None, 'UTC'])
@pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')])
def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s):
    # GH 12396
    # Row-wise concat of an all-NaT frame with a one-row frame must work
    # for every combination of tz-naive / tz-aware inputs.

    def localize(frame, tz):
        return frame.apply(lambda col: col.dt.tz_localize(tz))

    first = localize(pd.DataFrame([[pd.NaT], [pd.NaT]]), tz1)
    second = localize(pd.DataFrame([s]), tz2)

    result = pd.concat([first, second], axis=0)

    expected = localize(
        pd.DataFrame(pd.Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])), tz2)
    if tz1 != tz2:
        # mixing naive with tz-aware falls back to object dtype
        expected = expected.astype(object)

    assert_frame_equal(result, expected)

@pytest.mark.parametrize('tz1', [None, 'UTC'])
@pytest.mark.parametrize('tz2', [None, 'UTC'])
def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
    # GH 12396
    # Column-wise concat of all-NaT frames: the shorter column is padded
    # with NaT and each column keeps its own tz.

    def all_nat(count, tz):
        return pd.Series([pd.NaT] * count).dt.tz_localize(tz)

    first = pd.DataFrame(all_nat(2, tz1))
    second = pd.DataFrame(all_nat(1, tz2), columns=[1])

    result = pd.concat([first, second], axis=1)

    expected = pd.DataFrame({0: all_nat(2, tz1), 1: all_nat(2, tz2)})
    assert_frame_equal(result, expected)

@pytest.mark.parametrize('tz1', [None, 'UTC'])
@pytest.mark.parametrize('tz2', [None, 'UTC'])
def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
    # GH 12396
    # An all-NaT Series stacked on top of a timestamped frame must
    # concatenate for every tz-naive / tz-aware combination.

    ts_2015 = pd.Timestamp('2015/01/01', tz=tz2)
    ts_2016 = pd.Timestamp('2016/01/01', tz=tz2)

    first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
    second = pd.DataFrame([[ts_2015], [ts_2016]], index=[2, 3])

    expected = pd.DataFrame([pd.NaT, pd.NaT, ts_2015, ts_2016])
    if tz1 != tz2:
        # mixing naive with tz-aware upcasts to object dtype
        expected = expected.astype(object)

    result = pd.concat([first, second])
    assert_frame_equal(result, expected)

@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_concat_NaT_dataframes(self, tz):
    # GH 12396
    # Appending timestamped rows below an all-NaT frame keeps the
    # datetime dtype (tz-aware or naive) instead of raising.

    ts_a = pd.Timestamp('2015/01/01', tz=tz)
    ts_b = pd.Timestamp('2016/01/01', tz=tz)

    nat_frame = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply(
        lambda col: col.dt.tz_localize(tz))
    ts_frame = pd.DataFrame([[ts_a], [ts_b]], index=[2, 3])

    result = pd.concat([nat_frame, ts_frame], axis=0)

    expected = pd.DataFrame([pd.NaT, pd.NaT, ts_a, ts_b])
    assert_frame_equal(result, expected)

def test_concat_period_series(self):
x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D'))
Expand Down Expand Up @@ -1978,6 +2049,21 @@ def test_concat_empty_series(self):
columns=['x', 0])
tm.assert_frame_equal(res, exp)

@pytest.mark.parametrize('tz', [None, 'UTC'])
@pytest.mark.parametrize('values', [[], [1, 2, 3]])
def test_concat_empty_series_timelike(self, tz, values):
    # GH 18447
    # Column-wise concat with an empty datetime64 (optionally tz-aware)
    # Series must not raise; the empty column is padded with NaT.

    empty_dt = Series([], dtype='M8[ns]').dt.tz_localize(tz)
    other = Series(values)

    result = concat([empty_dt, other], axis=1)

    padded = pd.Series([pd.NaT] * len(values),
                       dtype='M8[ns]').dt.tz_localize(tz)
    expected = DataFrame({0: padded, 1: values})
    assert_frame_equal(result, expected)

def test_default_index(self):
# is_series and ignore_index
s1 = pd.Series([1, 2, 3], name='x')
Expand Down