Skip to content

API: add DatetimeBlockTZ #8260 #10477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 5, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "",
"environment_type": "conda",

// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/pydata/pandas/commit/",

// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["2.7", "3.4"],
"pythons": ["2.7", "3.4"],
"pythons": ["2.7"],

// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
Expand All @@ -41,7 +41,7 @@
"sqlalchemy": [],
"scipy": [],
"numexpr": [],
"tables": [],
"pytables": [],
"openpyxl": [],
"xlrd": [],
"xlwt": []
Expand Down
39 changes: 32 additions & 7 deletions asv_bench/benchmarks/binary_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,34 +203,59 @@ def time_series_timestamp_compare(self):

class timestamp_ops_diff1(object):
goal_time = 0.2
N = 1000000

def setup(self):
self.N = 1000000
self.s = Series(date_range('20010101', periods=self.N, freq='s'))
self.s = self.create()

def create(self):
return Series(date_range('20010101', periods=self.N, freq='s'))

def time_timestamp_ops_diff1(self):
self.s.diff()

class timestamp_tz_ops_diff1(timestamp_ops_diff1):
    # ASV benchmark variant: times Series.diff() (inherited from the parent)
    # on a timezone-aware series instead of a naive one.
    # Smaller N than the parent's 1000000 -- presumably because tz-aware ops
    # are slower; TODO confirm intended benchmark size.
    N = 10000

    def create(self):
        # Overrides the parent's create() hook; the parent's setup() assigns
        # self.s = self.create(), so this swaps in the tz-aware fixture.
        return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern'))

class timestamp_ops_diff2(object):
goal_time = 0.2
N = 1000000

def setup(self):
self.N = 1000000
self.s = Series(date_range('20010101', periods=self.N, freq='s'))
self.s = self.create()

def create(self):
return Series(date_range('20010101', periods=self.N, freq='s'))

def time_timestamp_ops_diff2(self):
(self.s - self.s.shift())

class timestamp_tz_ops_diff2(timestamp_ops_diff2):
    # ASV benchmark variant: times (s - s.shift()) (inherited from the parent)
    # on a timezone-aware series instead of a naive one.
    # Smaller N than the parent's 1000000 -- presumably because tz-aware ops
    # are slower; TODO confirm intended benchmark size.
    N = 10000

    def create(self):
        # Overrides the parent's create() hook; the parent's setup() assigns
        # self.s = self.create(), so this swaps in the tz-aware fixture.
        return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern'))

class timestamp_series_compare(object):
goal_time = 0.2
N = 1000000

def setup(self):
self.N = 1000000
self.halfway = ((self.N // 2) - 1)
self.s = Series(date_range('20010101', periods=self.N, freq='T'))
self.s = self.create()
self.ts = self.s[self.halfway]

def create(self):
return Series(date_range('20010101', periods=self.N, freq='T'))

def time_timestamp_series_compare(self):
(self.ts >= self.s)
(self.ts >= self.s)

class timestamp_tz_series_compare(timestamp_series_compare):
    # ASV benchmark variant: times the scalar-vs-series comparison
    # (ts >= s, inherited from the parent) on a timezone-aware series.
    # Smaller N than the parent's 1000000 -- presumably because tz-aware ops
    # are slower; TODO confirm intended benchmark size.
    N = 10000

    def create(self):
        # Overrides the parent's create() hook; the parent's setup() builds
        # self.s = self.create() and picks self.ts = self.s[halfway] from it.
        return Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern'))
15 changes: 11 additions & 4 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1590,9 +1590,10 @@ dtypes
------

The main types stored in pandas objects are ``float``, ``int``, ``bool``,
``datetime64[ns]``, ``timedelta[ns]`` and ``object``. In addition these dtypes
have item sizes, e.g. ``int64`` and ``int32``. A convenient :attr:`~DataFrame.dtypes``
attribute for DataFrames returns a Series with the data type of each column.
``datetime64[ns]`` and ``datetime64[ns, tz]`` (in >= 0.17.0), ``timedelta[ns]``, ``category`` (in >= 0.15.0), and ``object``. In addition these dtypes
have item sizes, e.g. ``int64`` and ``int32``. See :ref:`Series with TZ <timeseries.timezone_series>` for more detail on ``datetime64[ns, tz]`` dtypes.

A convenient :attr:`~DataFrame.dtypes` attribute for DataFrames returns a Series with the data type of each column.

.. ipython:: python

Expand Down Expand Up @@ -1814,8 +1815,14 @@ dtypes:
df['tdeltas'] = df.dates.diff()
df['uint64'] = np.arange(3, 6).astype('u8')
df['other_dates'] = pd.date_range('20130101', periods=3).values
df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')
df

And the dtypes

.. ipython:: python

df.dtypes

:meth:`~DataFrame.select_dtypes` has two parameters ``include`` and ``exclude`` that allow you to
say "give me the columns WITH these dtypes" (``include``) and/or "give the
Expand Down Expand Up @@ -1868,7 +1875,7 @@ All numpy dtypes are subclasses of ``numpy.generic``:

.. note::

Pandas also defines an additional ``category`` dtype, which is not integrated into the normal
Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal
   numpy hierarchy and won't show up with the above function.

.. note::
Expand Down
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Highlights include:

- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
- The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here <whatsnew_0170.api_breaking.sorting>`
- Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here <whatsnew_0170.tz>`
- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
Expand Down
27 changes: 27 additions & 0 deletions doc/source/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1745,3 +1745,30 @@ constructor as well as ``tz_localize``.

# tz_convert(None) is identical with tz_convert('UTC').tz_localize(None)
   didx.tz_convert('UTC').tz_localize(None)

.. _timeseries.timezone_series:

TZ aware Dtypes
~~~~~~~~~~~~~~~

.. versionadded:: 0.17.0

``Series/DatetimeIndex`` with timezone-naive values are represented with a dtype of ``datetime64[ns]``.

.. ipython:: python

dr = pd.date_range('20130101',periods=3)
dr
s = Series(dr)
s

``Series/DatetimeIndex`` with timezone-aware values are represented with a dtype of ``datetime64[ns, tz]``.

.. ipython:: python

dr = pd.date_range('20130101',periods=3,tz='US/Eastern')
dr
s = Series(dr)
s

Both of these ``Series`` can be manipulated via the ``.dt`` accessor, see the :ref:`docs <basics.dt_accessors>` as well.
56 changes: 56 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Highlights include:

- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
- The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here <whatsnew_0170.api_breaking.sorting>`
- Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here <whatsnew_0170.tz>`
- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
Expand Down Expand Up @@ -417,6 +418,58 @@ To keep the previous behaviour, you can use ``errors='ignore'``:
Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
has been deprecated in favor of ``errors='coerce'``.

.. _whatsnew_0170.tz:

Datetime with TZ
~~~~~~~~~~~~~~~~

We are adding an implementation that natively supports datetime with timezones. A ``Series`` or a ``DataFrame`` column previously
*could* be assigned a datetime with timezones, and would work as an ``object`` dtype. This had performance issues with a large
number of rows. (:issue:`8260`, :issue:`10763`)

The new implementation allows for having a single timezone across all rows, and operating on it in a performant manner.

.. ipython:: python

df = DataFrame({'A' : date_range('20130101',periods=3),
'B' : date_range('20130101',periods=3,tz='US/Eastern'),
'C' : date_range('20130101',periods=3,tz='CET')})
df
df.dtypes

.. ipython:: python

df.B
df.B.dt.tz_localize(None)

This uses a new-dtype representation as well, that is very similar in look-and-feel to its numpy cousin ``datetime64[ns]``

.. ipython:: python

df['B'].dtype
type(df['B']).dtype

.. note::

There is a slightly different string repr for the underlying ``DatetimeIndex`` as a result of the dtype changes, but
functionally these are the same.

.. code-block:: python

In [1]: pd.date_range('20130101',periods=3,tz='US/Eastern')
Out[1]: DatetimeIndex(['2013-01-01 00:00:00-05:00', '2013-01-02 00:00:00-05:00',
'2013-01-03 00:00:00-05:00'],
dtype='datetime64[ns]', freq='D', tz='US/Eastern')

In [2]: pd.date_range('20130101',periods=3,tz='US/Eastern').dtype
Out[2]: dtype('<M8[ns]')

.. ipython:: python

pd.date_range('20130101',periods=3,tz='US/Eastern')
pd.date_range('20130101',periods=3,tz='US/Eastern').dtype


.. _whatsnew_0170.api_breaking.convert_objects:

Changes to convert_objects
Expand Down Expand Up @@ -824,6 +877,9 @@ Bug Fixes
- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
- Bug in ``DataFrame.to_latex()`` the ``column_format`` argument could not be passed (:issue:`9402`)
- Bug in ``DatetimeIndex`` when localizing with ``NaT`` (:issue:`10477`)
- Bug in ``Series.dt`` ops in preserving meta-data (:issue:`10477`)
- Bug in preserving ``NaT`` when passed in an otherwise invalid ``to_datetime`` construction (:issue:`10477`)
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
Expand Down
22 changes: 17 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
"""
from pandas.core.series import Series
from pandas.tools.tile import cut
from pandas.tseries.period import PeriodIndex
from pandas import Index, PeriodIndex, DatetimeIndex

name = getattr(values, 'name', None)
values = Series(values).values
Expand All @@ -225,11 +225,15 @@ def value_counts(values, sort=True, ascending=False, normalize=False,

dtype = values.dtype
is_period = com.is_period_arraylike(values)
is_datetimetz = com.is_datetimetz(values)

if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
if com.is_datetime_or_timedelta_dtype(dtype) or is_period or is_datetimetz:

if is_period:
values = PeriodIndex(values, name=name)
values = PeriodIndex(values)
elif is_datetimetz:
tz = getattr(values, 'tz', None)
values = DatetimeIndex(values).tz_localize(None)

values = values.view(np.int64)
keys, counts = htable.value_count_scalar64(values, dropna)
Expand All @@ -239,8 +243,14 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
msk = keys != iNaT
keys, counts = keys[msk], counts[msk]

# localize to the original tz if necessary
if is_datetimetz:
keys = DatetimeIndex(keys).tz_localize(tz)

# convert the keys back to the dtype we came in
keys = keys.astype(dtype)
else:
keys = keys.astype(dtype)


elif com.is_integer_dtype(dtype):
values = com._ensure_int64(values)
Expand All @@ -257,7 +267,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
keys = np.insert(keys, 0, np.NaN)
counts = np.insert(counts, 0, mask.sum())

result = Series(counts, index=com._values_from_object(keys), name=name)
if not isinstance(keys, Index):
keys = Index(keys)
result = Series(counts, index=keys, name=name)

if bins is not None:
# TODO: This next line should be more efficient
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,11 @@ def base(self):
""" return the base object if the memory of the underlying data is shared """
return self.values.base

@property
def _values(self):
    """Return the internal implementation of the values.

    Default implementation is identical to the public ``.values``;
    NOTE(review): presumably subclasses override this where the internal
    representation differs from the public one (e.g. tz-aware data) --
    confirm against subclasses, which are not visible here.
    """
    return self.values

def max(self):
    """The maximum value of the object.

    Delegates to ``nanops.nanmax`` over ``self.values``.
    NOTE(review): ``nanops`` is not visible in this view -- presumably
    NaN values are skipped per that helper's convention; confirm.
    """
    return nanops.nanmax(self.values)
Expand Down Expand Up @@ -397,6 +402,14 @@ def hasnans(self):
""" return if I have any nans; enables various perf speedups """
return com.isnull(self).any()

def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
filter_type=None, **kwds):
""" perform the reduction type operation if we can """
func = getattr(self,name,None)
if func is None:
raise TypeError("{klass} cannot perform the operation {op}".format(klass=self.__class__.__name__,op=name))
return func(**kwds)

def value_counts(self, normalize=False, sort=True, ascending=False,
bins=None, dropna=True):
"""
Expand Down Expand Up @@ -586,7 +599,7 @@ def drop_duplicates(self, keep='first', inplace=False):
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
def duplicated(self, keep='first'):
keys = com._ensure_object(self.values)
keys = com._values_from_object(com._ensure_object(self.values))
duplicated = lib.duplicated(keys, keep=keep)
try:
return self._constructor(duplicated,
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
import pandas.core.common as com
from pandas.util.decorators import cache_readonly, deprecate_kwarg

from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCCategoricalIndex,
from pandas.core.common import (ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex,
isnull, notnull, is_dtype_equal,
is_categorical_dtype, is_integer_dtype, is_object_dtype,
_possibly_infer_to_datetimelike, get_dtype_kinds,
is_list_like, is_sequence, is_null_slice, is_bool,
_ensure_platform_int, _ensure_object, _ensure_int64,
_coerce_indexer_dtype, take_1d)
from pandas.core.dtypes import CategoricalDtype
from pandas.util.terminal import get_terminal_size
from pandas.core.config import get_option

Expand Down Expand Up @@ -85,7 +86,7 @@ def f(self, other):
def maybe_to_categorical(array):
""" coerce to a categorical if a series is given """
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array.values
return array._values
return array

_codes_doc = """The category codes of this categorical.
Expand Down Expand Up @@ -231,7 +232,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F

# we are either a Series or a CategoricalIndex
if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
values = values.values
values = values._values

if ordered is None:
ordered = values.ordered
Expand Down
Loading