From f5cb71f45089d8b43e98a9f7269bac0c16c33e7f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 5 Nov 2017 12:07:16 -0500 Subject: [PATCH] COMPAT: astype(object) and dtype=object should be reflexive for Series constructor with datetimelikes closes #17449 --- doc/source/whatsnew/v0.22.0.txt | 8 ++ pandas/core/dtypes/cast.py | 10 +- pandas/core/indexes/base.py | 5 +- pandas/core/indexes/datetimes.py | 7 +- pandas/core/indexes/period.py | 8 +- pandas/core/indexes/timedeltas.py | 2 +- pandas/core/series.py | 18 +-- pandas/tests/frame/test_block_internals.py | 25 +--- pandas/tests/indexes/datetimes/test_astype.py | 10 +- pandas/tests/indexes/period/test_period.py | 8 +- .../tests/indexes/timedeltas/test_astype.py | 8 +- pandas/tests/series/test_constructors.py | 110 +++++++++++++----- pandas/tests/series/test_dtypes.py | 31 ++++- 13 files changed, 166 insertions(+), 84 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 404d9f5d972b6..9c8c9ec1611ad 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -38,11 +38,18 @@ Backwards incompatible API changes - - + + + + + .. _whatsnew_0220.api: Other API Changes ^^^^^^^^^^^^^^^^^ +- :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) +- ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``; previously this would infer the datetime dtype (:issue:`18231`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`).
- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) @@ -80,6 +87,7 @@ Performance Improvements - :class`DateOffset` arithmetic performance is improved (:issue:`18218`) - Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) - The overriden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) +- ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) .. _whatsnew_0220.docs: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index eae283e9bc00d..4b99914758d55 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -13,7 +13,9 @@ is_datetimelike, is_extension_type, is_object_dtype, is_datetime64tz_dtype, is_datetime64_dtype, - is_timedelta64_dtype, is_dtype_equal, + is_datetime64_ns_dtype, + is_timedelta64_dtype, is_timedelta64_ns_dtype, + is_dtype_equal, is_float_dtype, is_complex_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype, @@ -829,8 +831,10 @@ def maybe_castable(arr): # check datetime64[ns]/timedelta64[ns] are valid # otherwise try to coerce kind = arr.dtype.kind - if kind == 'M' or kind == 'm': - return is_datetime64_dtype(arr.dtype) + if kind == 'M': + return is_datetime64_ns_dtype(arr.dtype) + elif kind == 'm': + return is_timedelta64_ns_dtype(arr.dtype) return arr.dtype.name not in _POSSIBLY_CAST_DTYPES diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1359a938e652d..50f03aca97447 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1027,13 +1027,16 @@ def to_frame(self, index=True): result.index = self return result - def _to_embed(self, keep_tz=False): + def _to_embed(self, keep_tz=False, dtype=None): """ *this is an internal non-public method* return 
an array repr of this object, potentially casting to object """ + if dtype is not None: + return self.astype(dtype)._to_embed(keep_tz=keep_tz) + return self.values.copy() _index_shared_docs['astype'] = """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 64b5b9f958880..3a11c80ecba64 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -917,7 +917,7 @@ def astype(self, dtype, copy=True): return Index(self.format(), name=self.name, dtype=object) elif is_period_dtype(dtype): return self.to_period(freq=dtype.freq) - raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) + raise TypeError('Cannot cast DatetimeIndex to dtype %s' % dtype) def _get_time_micros(self): values = self.asi8 @@ -957,12 +957,15 @@ def to_series(self, keep_tz=False): index=self._shallow_copy(), name=self.name) - def _to_embed(self, keep_tz=False): + def _to_embed(self, keep_tz=False, dtype=None): """ return an array repr of this object, potentially casting to object This is for internal compat """ + if dtype is not None: + return self.astype(dtype)._to_embed(keep_tz=keep_tz) + if keep_tz and self.tz is not None: # preserve the tz & copy diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 85e3300913000..76004994ae38a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -464,10 +464,14 @@ def __array_wrap__(self, result, context=None): def _box_func(self): return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) - def _to_embed(self, keep_tz=False): + def _to_embed(self, keep_tz=False, dtype=None): """ return an array repr of this object, potentially casting to object """ + + if dtype is not None: + return self.astype(dtype)._to_embed(keep_tz=keep_tz) + return self.asobject.values @property @@ -510,7 +514,7 @@ def astype(self, dtype, copy=True, how='start'): return self.to_timestamp(how=how).tz_localize(dtype.tz) elif is_period_dtype(dtype): return 
self.asfreq(freq=dtype.freq) - raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) + raise TypeError('Cannot cast PeriodIndex to dtype %s' % dtype) @Substitution(klass='PeriodIndex') @Appender(_shared_docs['searchsorted']) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index c592aa9608d97..eb4a9ce7e1439 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -495,7 +495,7 @@ def astype(self, dtype, copy=True): elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), dtype='i8', name=self.name) - raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype) + raise TypeError('Cannot cast TimedeltaIndex to dtype %s' % dtype) def union(self, other): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 6142ccdd2f2ac..be1de4c6814ba 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,8 @@ _is_unorderable_exception, _ensure_platform_int, pandas_dtype) -from pandas.core.dtypes.generic import ABCSparseArray, ABCDataFrame +from pandas.core.dtypes.generic import ( + ABCSparseArray, ABCDataFrame, ABCIndexClass) from pandas.core.dtypes.cast import ( maybe_upcast, infer_dtype_from_scalar, maybe_convert_platform, @@ -184,8 +185,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if name is None: name = data.name - data = data._to_embed(keep_tz=True) - copy = True + data = data._to_embed(keep_tz=True, dtype=dtype) + copy = False elif isinstance(data, np.ndarray): pass elif isinstance(data, Series): @@ -3139,7 +3140,9 @@ def _sanitize_index(data, index, copy=False): if len(data) != len(index): raise ValueError('Length of values does not match length of ' 'index') - if isinstance(data, PeriodIndex): + if isinstance(data, ABCIndexClass) and not copy: + pass + elif isinstance(data, PeriodIndex): data = data.asobject elif isinstance(data, DatetimeIndex): data = data._to_embed(keep_tz=True) @@ -3209,12 +3212,11 @@ def 
_try_cast(arr, take_fast_path): # e.g. indexes can have different conversions (so don't fast path # them) # GH 6140 - subarr = _sanitize_index(data, index, copy=True) + subarr = _sanitize_index(data, index, copy=copy) else: - subarr = _try_cast(data, True) - if copy: - subarr = data.copy() + # we will try to copy by definition here + subarr = _try_cast(data, True) elif isinstance(data, Categorical): subarr = data diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 3ca185cf158a7..c29821ba51284 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -233,10 +233,6 @@ def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64 arr = np.array([1, 2, 3], dtype='timedelta64[s]') - s = Series(arr) - expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s')) - assert_series_equal(s, expected) - df = DataFrame(index=range(3)) df['A'] = arr expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3, @@ -244,21 +240,6 @@ def test_construction_with_conversions(self): index=range(3)) assert_frame_equal(df, expected) - # convert from a numpy array of non-ns datetime64 - # note that creating a numpy datetime64 is in LOCAL time!!!!
- # seems to work for M8[D], but not for M8[s] - - s = Series(np.array(['2013-01-01', '2013-01-02', - '2013-01-03'], dtype='datetime64[D]')) - assert_series_equal(s, Series(date_range('20130101', periods=3, - freq='D'))) - - # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 - # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) - - # assert_series_equal(s,date_range('20130101 - # 00:00:01',period=3,freq='s')) - expected = DataFrame({ 'dt1': Timestamp('20130101'), 'dt2': date_range('20130101', periods=3), @@ -467,7 +448,7 @@ def test_convert_objects(self): self.mixed_frame['I'] = '1' # add in some items that will be nan - l = len(self.mixed_frame) + length = len(self.mixed_frame) self.mixed_frame['J'] = '1.' self.mixed_frame['K'] = '1' self.mixed_frame.loc[0:5, ['J', 'K']] = 'garbled' @@ -476,8 +457,8 @@ def test_convert_objects(self): assert converted['I'].dtype == 'int64' assert converted['J'].dtype == 'float64' assert converted['K'].dtype == 'float64' - assert len(converted['J'].dropna()) == l - 5 - assert len(converted['K'].dropna()) == l - 5 + assert len(converted['J'].dropna()) == length - 5 + assert len(converted['K'].dropna()) == length - 5 # via astype converted = self.mixed_frame.copy() diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 0197fc4c52617..e211807b6a3e4 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -130,11 +130,11 @@ def test_astype_raises(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) - pytest.raises(ValueError, idx.astype, float) - pytest.raises(ValueError, idx.astype, 'timedelta64') - pytest.raises(ValueError, idx.astype, 'timedelta64[ns]') - pytest.raises(ValueError, idx.astype, 'datetime64') - pytest.raises(ValueError, idx.astype, 'datetime64[D]') + pytest.raises(TypeError, idx.astype, float) + pytest.raises(TypeError, idx.astype, 'timedelta64') + 
pytest.raises(TypeError, idx.astype, 'timedelta64[ns]') + pytest.raises(TypeError, idx.astype, 'datetime64') + pytest.raises(TypeError, idx.astype, 'datetime64[D]') def test_index_convert_to_datetime_array(self): def _check_rng(rng): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index e5ee078d3558d..7fefcc859d447 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -47,10 +47,10 @@ def test_astype_raises(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') - pytest.raises(ValueError, idx.astype, str) - pytest.raises(ValueError, idx.astype, float) - pytest.raises(ValueError, idx.astype, 'timedelta64') - pytest.raises(ValueError, idx.astype, 'timedelta64[ns]') + pytest.raises(TypeError, idx.astype, str) + pytest.raises(TypeError, idx.astype, float) + pytest.raises(TypeError, idx.astype, 'timedelta64') + pytest.raises(TypeError, idx.astype, 'timedelta64[ns]') def test_pickle_compat_construction(self): pass diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index 7a761cfe30c62..0fa0e036096d0 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -66,10 +66,10 @@ def test_astype_raises(self): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) - pytest.raises(ValueError, idx.astype, float) - pytest.raises(ValueError, idx.astype, str) - pytest.raises(ValueError, idx.astype, 'datetime64') - pytest.raises(ValueError, idx.astype, 'datetime64[ns]') + pytest.raises(TypeError, idx.astype, float) + pytest.raises(TypeError, idx.astype, str) + pytest.raises(TypeError, idx.astype, 'datetime64') + pytest.raises(TypeError, idx.astype, 'datetime64[ns]') def test_pickle_compat_construction(self): pass diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py 
index d296086021349..e62b19294a07b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,9 +14,9 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64tz_dtype) -from pandas import (Index, Series, isna, date_range, - NaT, period_range, MultiIndex, IntervalIndex) -from pandas.core.indexes.datetimes import Timestamp, DatetimeIndex +from pandas import (Index, Series, isna, date_range, Timestamp, + NaT, period_range, timedelta_range, MultiIndex, + IntervalIndex) from pandas._libs import lib from pandas._libs.tslib import iNaT @@ -289,6 +289,25 @@ def test_constructor_copy(self): assert x[0] == 2. assert y[0] == 1. + @pytest.mark.parametrize( + "index", + [ + pd.date_range('20170101', periods=3, tz='US/Eastern'), + pd.date_range('20170101', periods=3), + pd.timedelta_range('1 day', periods=3), + pd.period_range('2012Q1', periods=3, freq='Q'), + pd.Index(list('abc')), + pd.Int64Index([1, 2, 3]), + pd.RangeIndex(0, 3)], + ids=lambda x: type(x).__name__) + def test_constructor_limit_copies(self, index): + # GH 17449 + # limit copies of input + s = pd.Series(index) + + # we make 1 copy; this is just a smoke test here + assert s._data.blocks[0].values is not index + def test_constructor_pass_none(self): s = Series(None, index=lrange(5)) assert s.dtype == np.float64 @@ -524,25 +543,6 @@ def test_constructor_with_datetime_tz(self): result = pd.concat([s.iloc[0:1], s.iloc[1:]]) assert_series_equal(result, s) - # astype - result = s.astype(object) - expected = Series(DatetimeIndex(s._values).asobject) - assert_series_equal(result, expected) - - result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz) - assert_series_equal(result, s) - - # astype - datetime64[ns, tz] - result = Series(s.values).astype('datetime64[ns, US/Eastern]') - assert_series_equal(result, s) - - result = Series(s.values).astype(s.dtype) - assert_series_equal(result, s) - - result = s.astype('datetime64[ns, CET]') - 
expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET')) - assert_series_equal(result, expected) - # short str assert 'datetime64[ns, US/Eastern]' in str(s) @@ -807,17 +807,67 @@ def test_auto_conversion(self): series = Series(list(date_range('1/1/2000', periods=10))) assert series.dtype == 'M8[ns]' - def test_constructor_cant_cast_datetime64(self): - msg = "Cannot cast datetime64 to " - with tm.assert_raises_regex(TypeError, msg): - Series(date_range('1/1/2000', periods=10), dtype=float) + def test_convert_non_ns(self): + # convert from a numpy array of non-ns timedelta64 + arr = np.array([1, 2, 3], dtype='timedelta64[s]') + s = Series(arr) + expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s')) + assert_series_equal(s, expected) + + # convert from a numpy array of non-ns datetime64 + # note that creating a numpy datetime64 is in LOCAL time!!!! + # seems to work for M8[D], but not for M8[s] + + s = Series(np.array(['2013-01-01', '2013-01-02', + '2013-01-03'], dtype='datetime64[D]')) + assert_series_equal(s, Series(date_range('20130101', periods=3, + freq='D'))) + + # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 + # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) + + # assert_series_equal(s,date_range('20130101 + # 00:00:01',period=3,freq='s')) + + @pytest.mark.parametrize( + "index", + [ + date_range('1/1/2000', periods=10), + timedelta_range('1 day', periods=10), + period_range('2000-Q1', periods=10, freq='Q')], + ids=lambda x: type(x).__name__) + def test_constructor_cant_cast_datetimelike(self, index): + # floats are not ok + msg = "Cannot cast {} to ".format(type(index).__name__) with tm.assert_raises_regex(TypeError, msg): - Series(date_range('1/1/2000', periods=10), dtype=int) + Series(index, dtype=float) + + # ints are ok + # we test with np.int64 to get similar results on + # windows / 32-bit platforms + result = Series(index, dtype=np.int64) + expected = Series(index.astype(np.int64)) + 
tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "index", + [ + date_range('1/1/2000', periods=10), + timedelta_range('1 day', periods=10), + period_range('2000-Q1', periods=10, freq='Q')], + ids=lambda x: type(x).__name__) + def test_constructor_cast_object(self, index): + s = Series(index, dtype=object) + exp = Series(index).astype(object) + tm.assert_series_equal(s, exp) + + s = Series(pd.Index(index, dtype=object), dtype=object) + exp = Series(index).astype(object) + tm.assert_series_equal(s, exp) - def test_constructor_cast_object(self): - s = Series(date_range('1/1/2000', periods=10), dtype=object) - exp = Series(date_range('1/1/2000', periods=10)) + s = Series(index.astype(object), dtype=object) + exp = Series(index).astype(object) tm.assert_series_equal(s, exp) def test_constructor_generic_timestamp_deprecated(self): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index b20c1817e5671..ad6d019b5287e 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -17,6 +17,7 @@ Categorical, Index ) from pandas.api.types import CategoricalDtype +import pandas._libs.tslib as tslib from pandas.compat import lrange, range, u from pandas import compat @@ -69,8 +70,7 @@ def test_astype_cast_object_int(self): tm.assert_series_equal(result, Series(np.arange(1, 5))) - def test_astype_datetimes(self): - import pandas._libs.tslib as tslib + def test_astype_datetime(self): s = Series(tslib.iNaT, dtype='M8[ns]', index=lrange(5)) s = s.astype('O') @@ -89,6 +89,33 @@ def test_astype_datetimes(self): s = s.astype('O') assert s.dtype == np.object_ + def test_astype_datetime64tz(self): + s = Series(date_range('20130101', periods=3, tz='US/Eastern')) + + # astype + result = s.astype(object) + expected = Series(s.astype(object), dtype=object) + tm.assert_series_equal(result, expected) + + result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz) + tm.assert_series_equal(result, 
s) + + # astype - object, preserves on construction + result = Series(s.astype(object)) + expected = s.astype(object) + tm.assert_series_equal(result, expected) + + # astype - datetime64[ns, tz] + result = Series(s.values).astype('datetime64[ns, US/Eastern]') + tm.assert_series_equal(result, s) + + result = Series(s.values).astype(s.dtype) + tm.assert_series_equal(result, s) + + result = s.astype('datetime64[ns, CET]') + expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET')) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("dtype", [compat.text_type, np.str_]) @pytest.mark.parametrize("series", [Series([string.digits * 10, tm.rands(63),