Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement BaseOffset in tslibs.offsets #18016

Merged
merged 15 commits into from
Nov 8, 2017
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 174 additions & 5 deletions pandas/_libs/tslibs/offsets.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
cimport cython

import time
from cpython.datetime cimport time as dt_time
from cpython.datetime cimport timedelta, time as dt_time

from dateutil.relativedelta import relativedelta

import numpy as np
cimport numpy as np
Expand All @@ -13,9 +15,11 @@ np.import_array()

from util cimport is_string_object

from conversion cimport tz_convert_single
from pandas._libs.tslib import pydt_to_i8

from frequencies cimport get_freq_code
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update setup.py for this

from conversion cimport tz_convert_single

# ---------------------------------------------------------------------
# Constants

Expand Down Expand Up @@ -79,7 +83,6 @@ _offset_to_period_map = {

need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS']


for __prefix in need_suffix:
for _m in _MONTHS:
key = '%s-%s' % (__prefix, _m)
Expand All @@ -105,17 +108,38 @@ def as_datetime(obj):
return obj


def _is_normalized(dt):
cpdef bint _is_normalized(dt):
if (dt.hour != 0 or dt.minute != 0 or dt.second != 0 or
dt.microsecond != 0 or getattr(dt, 'nanosecond', 0) != 0):
return False
return True


def apply_index_wraps(func):
# Note: normally we would use `@functools.wraps(func)`, but this does
# not play nicely wtih cython class methods
def wrapper(self, other):
result = func(self, other)
if self.normalize:
result = result.to_period('D').to_timestamp()
return result

# do @functools.wraps(func) manually since it doesn't work on cdef funcs
wrapper.__name__ = func.__name__
wrapper.__doc__ = func.__doc__
try:
wrapper.__module__ = func.__module__
except AttributeError:
# AttributeError: 'method_descriptor' object has no
# attribute '__module__'
pass
return wrapper


# ---------------------------------------------------------------------
# Business Helpers

def _get_firstbday(wkday):
cpdef int _get_firstbday(int wkday):
"""
wkday is the result of monthrange(year, month)

Expand Down Expand Up @@ -194,6 +218,45 @@ def _validate_business_time(t_input):
else:
raise ValueError("time data must be string or datetime.time")


# ---------------------------------------------------------------------
# Constructor Helpers

_rd_kwds = set([
'years', 'months', 'weeks', 'days',
'year', 'month', 'week', 'day', 'weekday',
'hour', 'minute', 'second', 'microsecond',
'nanosecond', 'nanoseconds',
'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'])


def _determine_offset(kwds):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment this is a method of DateOffset that only gets called in __init__.

# timedelta is used for sub-daily plural offsets and all singular
# offsets relativedelta is used for plural offsets of daily length or
# more nanosecond(s) are handled by apply_wraps
kwds_no_nanos = dict(
(k, v) for k, v in kwds.items()
if k not in ('nanosecond', 'nanoseconds')
)
# TODO: Are nanosecond and nanoseconds allowed somewhere?

_kwds_use_relativedelta = ('years', 'months', 'weeks', 'days',
'year', 'month', 'week', 'day', 'weekday',
'hour', 'minute', 'second', 'microsecond')

use_relativedelta = False
if len(kwds_no_nanos) > 0:
if any(k in _kwds_use_relativedelta for k in kwds_no_nanos):
offset = relativedelta(**kwds_no_nanos)
use_relativedelta = True
else:
# sub-daily offset - use timedelta (tz-aware)
offset = timedelta(**kwds_no_nanos)
else:
offset = timedelta(1)
return offset, use_relativedelta


# ---------------------------------------------------------------------
# Mixins & Singletons

Expand All @@ -206,3 +269,109 @@ class ApplyTypeError(TypeError):
# TODO: unused. remove?
class CacheableOffset(object):
_cacheable = True


class BeginMixin(object):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BeginMixin and EndMixin are new, each only have the one method. At the moment these methods are in DateOffset, but they are only used by a small handful of FooBegin and BarEnd subclasses.

# helper for vectorized offsets

def _beg_apply_index(self, i, freq):
"""Offsets index to beginning of Period frequency"""

off = i.to_perioddelta('D')

base, mult = get_freq_code(freq)
base_period = i.to_period(base)
if self.n <= 0:
# when subtracting, dates on start roll to prior
roll = np.where(base_period.to_timestamp() == i - off,
self.n, self.n + 1)
else:
roll = self.n

base = (base_period + roll).to_timestamp()
return base + off


class EndMixin(object):
# helper for vectorized offsets

def _end_apply_index(self, i, freq):
"""Offsets index to end of Period frequency"""

off = i.to_perioddelta('D')

base, mult = get_freq_code(freq)
base_period = i.to_period(base)
if self.n > 0:
# when adding, dates on end roll to next
roll = np.where(base_period.to_timestamp(how='end') == i - off,
self.n, self.n - 1)
else:
roll = self.n

base = (base_period + roll).to_timestamp(how='end')
return base + off


# ---------------------------------------------------------------------
# Base Classes

class _BaseOffset(object):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are you creating a base class here? what is the purpose?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IOW why not simply have 1 Base class (and not a _BaseOffset and a BaseOffset)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comments about remaining cython/pickle issues.

You're absolutely right that in its current form having two separate classes accomplishes nothing. The idea is that _BaseOffset should be a cdef class, while BaseOffset should be python class. (__rfoo__ methods do not play nicely with cython classes).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok that is fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would probably leave this as a class for the moment. I am not convinced this actually needs to be a full c-extension class (e.g. its not like we are inheriting from a python c-class here). I don't see the benefit and it has added complexity.

Copy link
Member Author

@jbrockmendel jbrockmendel Oct 29, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main reason is to achieve immutability. That's the big roadblock between us and making __eq__, __ne__, __hash__ performance not-awful. (There's an issue somewhere about "scalar types immutable" or something like that)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

"""
Base class for DateOffset methods that are not overriden by subclasses
and will (after pickle errors are resolved) go into a cdef class.
"""
_typ = "dateoffset"
_normalize_cache = True
_cacheable = False

def __call__(self, other):
return self.apply(other)

def __mul__(self, someInt):
return self.__class__(n=someInt * self.n, normalize=self.normalize,
**self.kwds)

def __neg__(self):
# Note: we are defering directly to __mul__ instead of __rmul__, as
# that allows us to use methods that can go in a `cdef class`
return self * -1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the status quo __neg__ is defined as return self.__class__(-self.n, normalize=self.normalize, **self.kwds). By deferring to __mul__, we move away from the self.kwds pattern. Ditto for copy.


def copy(self):
# Note: we are defering directly to __mul__ instead of __rmul__, as
# that allows us to use methods that can go in a `cdef class`
return self * 1

# TODO: this is never true. fix it or get rid of it
def _should_cache(self):
return self.isAnchored() and self._cacheable

def __repr__(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

side note, the repr is currently used for hashing, but instead should simply define __hash__ I think.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__hash__ is defined using _params() which is the god-awful slow thing we need to get rid of.

className = getattr(self, '_outputName', type(self).__name__)

if abs(self.n) != 1:
plural = 's'
else:
plural = ''

n_str = ""
if self.n != 1:
n_str = "%s * " % self.n

out = '<%s' % n_str + className + plural + self._repr_attrs() + '>'
return out


class BaseOffset(_BaseOffset):
# Here we add __rfoo__ methods that don't play well with cdef classes
def __rmul__(self, someInt):
return self.__mul__(someInt)

def __radd__(self, other):
return self.__add__(other)

def __rsub__(self, other):
if getattr(other, '_typ', None) in ['datetimeindex', 'series']:
# i.e. isinstance(other, (ABCDatetimeIndex, ABCSeries))
return other - self
return -self + other
Loading