diff --git a/doc/source/release.rst b/doc/source/release.rst index 3f3e3e87133a0..cd5b0cbd23353 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -208,6 +208,7 @@ Improvements to existing features - Performance improvement when converting ``DatetimeIndex`` to floating ordinals using ``DatetimeConverter`` (:issue:`6636`) - Performance improvement for ``DataFrame.shift`` (:issue:`5609`) +- Performance improvements in timedelta conversions for integer dtypes (:issue:`6754`) .. _release.bug_fixes-0.14.0: diff --git a/pandas/core/common.py b/pandas/core/common.py index b33ee6d66f901..84d22a31531f8 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2130,6 +2130,16 @@ def is_timedelta64_dtype(arr_or_dtype): return issubclass(tipo, np.timedelta64) +def is_timedelta64_ns_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + tipo = arr_or_dtype.type + elif isinstance(arr_or_dtype, type): + tipo = np.dtype(arr_or_dtype).type + else: + tipo = arr_or_dtype.dtype.type + return tipo == _TD_DTYPE + + def needs_i8_conversion(arr_or_dtype): return (is_datetime64_dtype(arr_or_dtype) or is_timedelta64_dtype(arr_or_dtype)) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index c490aee134a1a..341feec67fb9b 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -173,6 +173,36 @@ def conv(v): expected = np.timedelta64(timedelta(seconds=1)) self.assertEqual(result, expected) + # arrays of various dtypes + arr = np.array([1]*5,dtype='int64') + result = to_timedelta(arr,unit='s') + expected = Series([ np.timedelta64(1,'s') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='int64') + result = to_timedelta(arr,unit='m') + expected = Series([ np.timedelta64(1,'m') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='int64') + result = to_timedelta(arr,unit='h') + expected = Series([ np.timedelta64(1,'h') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='timedelta64[s]') + result = to_timedelta(arr) + expected = Series([ np.timedelta64(1,'s') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='timedelta64[D]') + result = to_timedelta(arr) + expected = Series([ np.timedelta64(1,'D') ]*5) + tm.assert_series_equal(result, expected) + + # these will error + self.assertRaises(ValueError, lambda : to_timedelta(['1h'])) + self.assertRaises(ValueError, lambda : to_timedelta(['1m'])) + def test_to_timedelta_via_apply(self): _skip_if_numpy_not_friendly() diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 4a522d9874c4f..78dbd246648c8 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -8,7 +8,7 @@ import numpy as np import pandas.tslib as tslib from pandas import compat, _np_version_under1p7 -from pandas.core.common import (ABCSeries, is_integer, is_timedelta64_dtype, +from pandas.core.common import (ABCSeries, is_integer, is_integer_dtype, is_timedelta64_dtype, _values_from_object, is_list_like, isnull) repr_timedelta = tslib.repr_timedelta64 @@ -23,7 +23,7 @@ def to_timedelta(arg, box=True, unit='ns'): arg : string, timedelta, array of strings (with possible NAs) box : boolean, default True If True returns a Series of the results, if False returns ndarray of values - unit : unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer/float number + unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an integer/float number Returns ------- @@ -32,18 +32,22 @@ def to_timedelta(arg, box=True, unit='ns'): if _np_version_under1p7: raise ValueError("to_timedelta is not support for numpy < 1.7") - def _convert_listlike(arg, box): + def _convert_listlike(arg, box, unit): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') if is_timedelta64_dtype(arg): - if box: - from pandas import Series - return Series(arg,dtype='m8[ns]') - return arg + value = arg.astype('timedelta64[ns]') + elif is_integer_dtype(arg): + # these are shortcutable + value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]') + else: + try: + value = tslib.array_to_timedelta64(_ensure_object(arg),unit=unit) + except: + value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit) for r in arg ]) - value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit) for r in arg ]) if box: from pandas import Series value = Series(value,dtype='m8[ns]') @@ -53,10 +57,10 @@ def _convert_listlike(arg, box): return arg elif isinstance(arg, ABCSeries): from pandas import Series - values = _convert_listlike(arg.values, box=False) + values = _convert_listlike(arg.values, box=False, unit=unit) return Series(values, index=arg.index, name=arg.name, dtype='m8[ns]') elif is_list_like(arg): - return _convert_listlike(arg, box=box) + return _convert_listlike(arg, box=box, unit=unit) # ...so it must be a scalar value. Return scalar. return _coerce_scalar_to_timedelta_type(arg, unit=unit) @@ -139,7 +143,7 @@ def convert(r=None, unit=None, m=m): return convert # no converter - raise ValueError("cannot create timedelta string converter") + raise ValueError("cannot create timedelta string converter for [{0}]".format(r)) def _possibly_cast_to_timedelta(value, coerce=True): """ try to cast to timedelta64, if already a timedeltalike, then make diff --git a/vb_suite/suite.py b/vb_suite/suite.py index 1b845e88a9d79..03f85da698ff8 100644 --- a/vb_suite/suite.py +++ b/vb_suite/suite.py @@ -26,6 +26,7 @@ 'reshape', 'stat_ops', 'timeseries', + 'timedelta', 'eval'] by_module = {} diff --git a/vb_suite/timedelta.py b/vb_suite/timedelta.py new file mode 100644 index 0000000000000..febd70739b2c9 --- /dev/null +++ b/vb_suite/timedelta.py @@ -0,0 +1,32 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +from pandas import to_timedelta +""" + +#---------------------------------------------------------------------- +# conversion + +setup = common_setup + """ +arr = np.random.randint(0,1000,size=10000) +""" + +stmt = "to_timedelta(arr,unit='s')" +timedelta_convert_int = Benchmark(stmt, setup, start_date=datetime(2014, 1, 1)) + +setup = common_setup + """ +arr = np.random.randint(0,1000,size=10000) +arr = [ '{0} days'.format(i) for i in arr ] +""" + +stmt = "to_timedelta(arr)" +timedelta_convert_string = Benchmark(stmt, setup, start_date=datetime(2014, 1, 1)) + +setup = common_setup + """ +arr = np.random.randint(0,60,size=10000) +arr = [ '00:00:{0:02d}'.format(i) for i in arr ] +""" + +stmt = "to_timedelta(arr)" +timedelta_convert_string_seconds = Benchmark(stmt, setup, start_date=datetime(2014, 1, 1)) diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index fafa7f75501d9..ccd4bd7ae371a 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -278,7 +278,7 @@ def date_range(start=None, end=None, periods=None, freq=None): """ datetimeindex_converter = \ - Benchmark('DatetimeConverter.convert(rng, None, None)', + Benchmark('DatetimeConverter.convert(rng, None, None)', setup, start_date=datetime(2013, 1, 1)) # Adding custom business day