diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index d10d51352d0e4..72ca371f6c6be 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1372,6 +1372,8 @@ Reshaping - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) - Bug in :func:`get_dummies`, and :func:`select_dtypes`, where duplicate column names caused incorrect behavior (:issue:`20848`) - Bug in :func:`isna`, which cannot handle ambiguous typed lists (:issue:`20675`) +- Bug in :func:`concat` which raised an error when concatenating TZ-aware dataframes and all-NaT dataframes (:issue:`12396`) +- Bug in :func:`concat` which raised an error when concatenating empty TZ-aware series (:issue:`18447`) Other ^^^^^ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 4aa74cdbbc2c0..9f6813bc38464 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -416,6 +416,13 @@ def _maybe_unwrap(x): fastpath=True) +def _concatenate_2d(to_concat, axis): + # coerce to 2d if needed & concatenate + if axis == 1: + to_concat = [np.atleast_2d(x) for x in to_concat] + return np.concatenate(to_concat, axis=axis) + + def _concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a @@ -432,61 +439,57 @@ def _concat_datetime(to_concat, axis=0, typs=None): a single array, preserving the combined dtypes """ - def convert_to_pydatetime(x, axis): - # coerce to an object dtype + if typs is None: + typs = get_dtype_kinds(to_concat) - # if dtype is of datetimetz or timezone - if x.dtype.kind == _NS_DTYPE.kind: - if getattr(x, 'tz', None) is not None: - x = x.astype(object).values - else: - shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), - box="timestamp") - x = x.reshape(shape) + # multiple types, need to coerce to object + if len(typs) != 1: + return
_concatenate_2d([_convert_datetimelike_to_object(x) + for x in to_concat], + axis=axis) - elif x.dtype == _TD_DTYPE: - shape = x.shape - x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) - x = x.reshape(shape) + # must be single dtype + if any(typ.startswith('datetime') for typ in typs): - if axis == 1: - x = np.atleast_2d(x) - return x + if 'datetime' in typs: + to_concat = [np.array(x, copy=False).view(np.int64) + for x in to_concat] + return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE) + else: + # when to_concat has different tz, len(typs) > 1, + # so that case already took the object path above + return _concat_datetimetz(to_concat) - if typs is None: - typs = get_dtype_kinds(to_concat) + elif 'timedelta' in typs: + return _concatenate_2d([x.view(np.int64) for x in to_concat], + axis=axis).view(_TD_DTYPE) - # must be single dtype - if len(typs) == 1: - _contains_datetime = any(typ.startswith('datetime') for typ in typs) - _contains_period = any(typ.startswith('period') for typ in typs) + elif any(typ.startswith('period') for typ in typs): + # PeriodIndex must be handled by PeriodIndex, + # thus this condition can't be met ATM. + # Must be changed when we add PeriodDtype + raise NotImplementedError("unable to concat PeriodDtype") - if _contains_datetime: - if 'datetime' in typs: - new_values = np.concatenate([x.view(np.int64) for x in - to_concat], axis=axis) - return new_values.view(_NS_DTYPE) - else: - # when to_concat has different tz, len(typs) > 1.
- # thus no need to care - return _concat_datetimetz(to_concat) - - elif 'timedelta' in typs: - new_values = np.concatenate([x.view(np.int64) for x in to_concat], - axis=axis) - return new_values.view(_TD_DTYPE) - - elif _contains_period: - # PeriodIndex must be handled by PeriodIndex, - # Thus can't meet this condition ATM - # Must be changed when we adding PeriodDtype - raise NotImplementedError - - # need to coerce to object - to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] - return np.concatenate(to_concat, axis=axis) +def _convert_datetimelike_to_object(x): + # coerce datetimelike array to object dtype + + # if dtype is of datetimetz or timezone + if x.dtype.kind == _NS_DTYPE.kind: + if getattr(x, 'tz', None) is not None: + x = x.astype(object).values + else: + shape = x.shape + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), + box="timestamp") + x = x.reshape(shape) + + elif x.dtype == _TD_DTYPE: + shape = x.shape + x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) + x = x.reshape(shape) + + return x def _concat_datetimetz(to_concat, name=None): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7837faf5b4c0f..df39eb5fd8312 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2190,10 +2190,10 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, msg = ('When allow_fill=True and fill_value is not None, ' 'all indices must be >= -1') raise ValueError(msg) - taken = values.take(indices) - mask = indices == -1 - if mask.any(): - taken[mask] = na_value + taken = algos.take(values, + indices, + allow_fill=allow_fill, + fill_value=na_value) else: taken = values.take(indices) return taken diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 474894aba65df..e7b2576ca1eae 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5835,7 +5835,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): if len(values) and values[0] 
is None: fill_value = None - if getattr(self.block, 'is_datetimetz', False): + if getattr(self.block, 'is_datetimetz', False) or \ + is_datetimetz(empty_dtype): pass elif getattr(self.block, 'is_categorical', False): pass diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 57af67422d65f..f5e58fa70e1c4 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1917,6 +1917,77 @@ def test_concat_tz_series_tzlocal(self): tm.assert_series_equal(result, pd.Series(x + y)) assert result.dtype == 'datetime64[ns, tzlocal()]' + @pytest.mark.parametrize('tz1', [None, 'UTC']) + @pytest.mark.parametrize('tz2', [None, 'UTC']) + @pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')]) + def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): + # GH 12396 + + # all-NaT frame, tz-naive or tz-aware depending on tz1 + first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply( + lambda x: x.dt.tz_localize(tz1)) + second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) + + result = pd.concat([first, second], axis=0) + expected = pd.DataFrame(pd.Series( + [pd.NaT, pd.NaT, s], index=[0, 1, 0])) + expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) + if tz1 != tz2: + expected = expected.astype(object) + + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('tz1', [None, 'UTC']) + @pytest.mark.parametrize('tz2', [None, 'UTC']) + def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2): + # GH 12396 + + first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)) + second = pd.DataFrame(pd.Series( + [pd.NaT]).dt.tz_localize(tz2), columns=[1]) + expected = pd.DataFrame( + {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), + 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)} + ) + result = pd.concat([first, second], axis=1) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('tz1', [None, 'UTC']) + @pytest.mark.parametrize('tz2', [None, 'UTC']) + def
test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): + # GH 12396 + + # all-NaT series, tz-naive or tz-aware depending on tz1 + first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1) + second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)], + [pd.Timestamp('2016/01/01', tz=tz2)]], + index=[2, 3]) + + expected = pd.DataFrame([pd.NaT, pd.NaT, + pd.Timestamp('2015/01/01', tz=tz2), + pd.Timestamp('2016/01/01', tz=tz2)]) + if tz1 != tz2: + expected = expected.astype(object) + + result = pd.concat([first, second]) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('tz', [None, 'UTC']) + def test_concat_NaT_dataframes(self, tz): + # GH 12396 + + first = pd.DataFrame([[pd.NaT], [pd.NaT]]) + first = first.apply(lambda x: x.dt.tz_localize(tz)) + second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)], + [pd.Timestamp('2016/01/01', tz=tz)]], + index=[2, 3]) + expected = pd.DataFrame([pd.NaT, pd.NaT, + pd.Timestamp('2015/01/01', tz=tz), + pd.Timestamp('2016/01/01', tz=tz)]) + + result = pd.concat([first, second], axis=0) + assert_frame_equal(result, expected) + def test_concat_period_series(self): x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) @@ -1978,6 +2049,21 @@ def test_concat_empty_series(self): columns=['x', 0]) tm.assert_frame_equal(res, exp) + @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize('values', [[], [1, 2, 3]]) + def test_concat_empty_series_timelike(self, tz, values): + # GH 18447 + + first = Series([], dtype='M8[ns]').dt.tz_localize(tz) + second = Series(values) + expected = DataFrame( + {0: pd.Series([pd.NaT] * len(values), + dtype='M8[ns]' + ).dt.tz_localize(tz), + 1: values}) + result = concat([first, second], axis=1) + assert_frame_equal(result, expected) + def test_default_index(self): # is_series and ignore_index s1 = pd.Series([1, 2, 3], name='x')