Skip to content

Reduce copying of input data on Series construction #18231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,18 @@ Backwards incompatible API changes
-
-






.. _whatsnew_0220.api:

Other API Changes
^^^^^^^^^^^^^^^^^

- :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`)
- ``Series`` construction from ``object``-dtyped tz-aware datetime data with ``dtype=object`` specified will now return an ``object``-dtyped ``Series``; previously the datetime dtype would be inferred (:issue:`18231`)
- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`).
- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`)
Expand Down Expand Up @@ -80,6 +87,7 @@ Performance Improvements
- :class:`DateOffset` arithmetic performance is improved (:issue:`18218`)
- Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc. is sped up through vectorization of the underlying methods (:issue:`18092`)
- The overridden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`)
- ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`)

.. _whatsnew_0220.docs:

Expand Down
10 changes: 7 additions & 3 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
is_datetimelike,
is_extension_type, is_object_dtype,
is_datetime64tz_dtype, is_datetime64_dtype,
is_timedelta64_dtype, is_dtype_equal,
is_datetime64_ns_dtype,
is_timedelta64_dtype, is_timedelta64_ns_dtype,
is_dtype_equal,
is_float_dtype, is_complex_dtype,
is_integer_dtype,
is_datetime_or_timedelta_dtype,
Expand Down Expand Up @@ -829,8 +831,10 @@ def maybe_castable(arr):
# check datetime64[ns]/timedelta64[ns] are valid
# otherwise try to coerce
kind = arr.dtype.kind
if kind == 'M' or kind == 'm':
return is_datetime64_dtype(arr.dtype)
if kind == 'M':
return is_datetime64_ns_dtype(arr.dtype)
elif kind == 'm':
return is_timedelta64_ns_dtype(arr.dtype)

return arr.dtype.name not in _POSSIBLY_CAST_DTYPES

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,13 +1027,16 @@ def to_frame(self, index=True):
result.index = self
return result

def _to_embed(self, keep_tz=False):
def _to_embed(self, keep_tz=False, dtype=None):
"""
*this is an internal non-public method*

return an array repr of this object, potentially casting to object

"""
if dtype is not None:
return self.astype(dtype)._to_embed(keep_tz=keep_tz)

return self.values.copy()

_index_shared_docs['astype'] = """
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,7 +917,7 @@ def astype(self, dtype, copy=True):
return Index(self.format(), name=self.name, dtype=object)
elif is_period_dtype(dtype):
return self.to_period(freq=dtype.freq)
raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype)
raise TypeError('Cannot cast DatetimeIndex to dtype %s' % dtype)

def _get_time_micros(self):
values = self.asi8
Expand Down Expand Up @@ -957,12 +957,15 @@ def to_series(self, keep_tz=False):
index=self._shallow_copy(),
name=self.name)

def _to_embed(self, keep_tz=False):
def _to_embed(self, keep_tz=False, dtype=None):
"""
return an array repr of this object, potentially casting to object

This is for internal compat
"""
if dtype is not None:
return self.astype(dtype)._to_embed(keep_tz=keep_tz)

if keep_tz and self.tz is not None:

# preserve the tz & copy
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,10 +464,14 @@ def __array_wrap__(self, result, context=None):
def _box_func(self):
return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq)

def _to_embed(self, keep_tz=False):
def _to_embed(self, keep_tz=False, dtype=None):
"""
return an array repr of this object, potentially casting to object
"""

if dtype is not None:
return self.astype(dtype)._to_embed(keep_tz=keep_tz)

return self.asobject.values

@property
Expand Down Expand Up @@ -510,7 +514,7 @@ def astype(self, dtype, copy=True, how='start'):
return self.to_timestamp(how=how).tz_localize(dtype.tz)
elif is_period_dtype(dtype):
return self.asfreq(freq=dtype.freq)
raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
raise TypeError('Cannot cast PeriodIndex to dtype %s' % dtype)

@Substitution(klass='PeriodIndex')
@Appender(_shared_docs['searchsorted'])
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ def astype(self, dtype, copy=True):
elif is_integer_dtype(dtype):
return Index(self.values.astype('i8', copy=copy), dtype='i8',
name=self.name)
raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype)
raise TypeError('Cannot cast TimedeltaIndex to dtype %s' % dtype)

def union(self, other):
"""
Expand Down
18 changes: 10 additions & 8 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
_is_unorderable_exception,
_ensure_platform_int,
pandas_dtype)
from pandas.core.dtypes.generic import ABCSparseArray, ABCDataFrame
from pandas.core.dtypes.generic import (
ABCSparseArray, ABCDataFrame, ABCIndexClass)
from pandas.core.dtypes.cast import (
maybe_upcast, infer_dtype_from_scalar,
maybe_convert_platform,
Expand Down Expand Up @@ -184,8 +185,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
if name is None:
name = data.name

data = data._to_embed(keep_tz=True)
copy = True
data = data._to_embed(keep_tz=True, dtype=dtype)
copy = False
elif isinstance(data, np.ndarray):
pass
elif isinstance(data, Series):
Expand Down Expand Up @@ -3139,7 +3140,9 @@ def _sanitize_index(data, index, copy=False):
if len(data) != len(index):
raise ValueError('Length of values does not match length of ' 'index')

if isinstance(data, PeriodIndex):
if isinstance(data, ABCIndexClass) and not copy:
pass
elif isinstance(data, PeriodIndex):
data = data.asobject
elif isinstance(data, DatetimeIndex):
data = data._to_embed(keep_tz=True)
Expand Down Expand Up @@ -3209,12 +3212,11 @@ def _try_cast(arr, take_fast_path):
# e.g. indexes can have different conversions (so don't fast path
# them)
# GH 6140
subarr = _sanitize_index(data, index, copy=True)
subarr = _sanitize_index(data, index, copy=copy)
else:
subarr = _try_cast(data, True)

if copy:
subarr = data.copy()
# we will try to copy by definition here
subarr = _try_cast(data, True)

elif isinstance(data, Categorical):
subarr = data
Expand Down
25 changes: 3 additions & 22 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,32 +233,13 @@ def test_construction_with_conversions(self):

# convert from a numpy array of non-ns timedelta64
arr = np.array([1, 2, 3], dtype='timedelta64[s]')
s = Series(arr)
expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s'))
assert_series_equal(s, expected)

df = DataFrame(index=range(3))
df['A'] = arr
expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3,
freq='s')},
index=range(3))
assert_frame_equal(df, expected)

# convert from a numpy array of non-ns datetime64
# note that creating a numpy datetime64 is in LOCAL time!!!!
# seems to work for M8[D], but not for M8[s]

s = Series(np.array(['2013-01-01', '2013-01-02',
'2013-01-03'], dtype='datetime64[D]'))
assert_series_equal(s, Series(date_range('20130101', periods=3,
freq='D')))

# s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
# 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

# assert_series_equal(s,date_range('20130101
# 00:00:01',period=3,freq='s'))

expected = DataFrame({
'dt1': Timestamp('20130101'),
'dt2': date_range('20130101', periods=3),
Expand Down Expand Up @@ -467,7 +448,7 @@ def test_convert_objects(self):
self.mixed_frame['I'] = '1'

# add in some items that will be nan
l = len(self.mixed_frame)
length = len(self.mixed_frame)
self.mixed_frame['J'] = '1.'
self.mixed_frame['K'] = '1'
self.mixed_frame.loc[0:5, ['J', 'K']] = 'garbled'
Expand All @@ -476,8 +457,8 @@ def test_convert_objects(self):
assert converted['I'].dtype == 'int64'
assert converted['J'].dtype == 'float64'
assert converted['K'].dtype == 'float64'
assert len(converted['J'].dropna()) == l - 5
assert len(converted['K'].dropna()) == l - 5
assert len(converted['J'].dropna()) == length - 5
assert len(converted['K'].dropna()) == length - 5

# via astype
converted = self.mixed_frame.copy()
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/indexes/datetimes/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,11 @@ def test_astype_raises(self):
# GH 13149, GH 13209
idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN])

pytest.raises(ValueError, idx.astype, float)
pytest.raises(ValueError, idx.astype, 'timedelta64')
pytest.raises(ValueError, idx.astype, 'timedelta64[ns]')
pytest.raises(ValueError, idx.astype, 'datetime64')
pytest.raises(ValueError, idx.astype, 'datetime64[D]')
pytest.raises(TypeError, idx.astype, float)
pytest.raises(TypeError, idx.astype, 'timedelta64')
pytest.raises(TypeError, idx.astype, 'timedelta64[ns]')
pytest.raises(TypeError, idx.astype, 'datetime64')
pytest.raises(TypeError, idx.astype, 'datetime64[D]')

def test_index_convert_to_datetime_array(self):
def _check_rng(rng):
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/indexes/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ def test_astype_raises(self):
# GH 13149, GH 13209
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')

pytest.raises(ValueError, idx.astype, str)
pytest.raises(ValueError, idx.astype, float)
pytest.raises(ValueError, idx.astype, 'timedelta64')
pytest.raises(ValueError, idx.astype, 'timedelta64[ns]')
pytest.raises(TypeError, idx.astype, str)
pytest.raises(TypeError, idx.astype, float)
pytest.raises(TypeError, idx.astype, 'timedelta64')
pytest.raises(TypeError, idx.astype, 'timedelta64[ns]')

def test_pickle_compat_construction(self):
pass
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/indexes/timedeltas/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ def test_astype_raises(self):
# GH 13149, GH 13209
idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN])

pytest.raises(ValueError, idx.astype, float)
pytest.raises(ValueError, idx.astype, str)
pytest.raises(ValueError, idx.astype, 'datetime64')
pytest.raises(ValueError, idx.astype, 'datetime64[ns]')
pytest.raises(TypeError, idx.astype, float)
pytest.raises(TypeError, idx.astype, str)
pytest.raises(TypeError, idx.astype, 'datetime64')
pytest.raises(TypeError, idx.astype, 'datetime64[ns]')

def test_pickle_compat_construction(self):
pass
Expand Down
Loading