Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement BaseOffset in tslibs.offsets #18016

Merged
merged 15 commits into from
Nov 8, 2017
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 175 additions & 3 deletions pandas/_libs/tslibs/offsets.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
cimport cython

import time
from cpython.datetime cimport time as dt_time
from cpython.datetime cimport timedelta, time as dt_time

from dateutil.relativedelta import relativedelta

import numpy as np
cimport numpy as np
Expand All @@ -15,6 +17,7 @@ from util cimport is_string_object


from pandas._libs.tslib import pydt_to_i8, tz_convert_single
from frequencies cimport get_freq_code
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update setup.py for this


# ---------------------------------------------------------------------
# Constants
Expand Down Expand Up @@ -105,17 +108,38 @@ def as_datetime(obj):
return obj


def _is_normalized(dt):
cpdef bint _is_normalized(dt):
if (dt.hour != 0 or dt.minute != 0 or dt.second != 0 or
dt.microsecond != 0 or getattr(dt, 'nanosecond', 0) != 0):
return False
return True


def apply_index_wraps(func):
# Note: normally we would use `@functools.wraps(func)`, but this does
# not play nicely wtih cython class methods
def wrapper(self, other):
result = func(self, other)
if self.normalize:
result = result.to_period('D').to_timestamp()
return result

# do @functools.wraps(func) manually since it doesn't work on cdef funcs
wrapper.__name__ = func.__name__
wrapper.__doc__ = func.__doc__
try:
wrapper.__module__ = func.__module__
except AttributeError:
# AttributeError: 'method_descriptor' object has no
# attribute '__module__'
pass
return wrapper


# ---------------------------------------------------------------------
# Business Helpers

def _get_firstbday(wkday):
cpdef int _get_firstbday(int wkday):
"""
wkday is the result of monthrange(year, month)

Expand Down Expand Up @@ -194,6 +218,48 @@ def _validate_business_time(t_input):
else:
raise ValueError("time data must be string or datetime.time")


# ---------------------------------------------------------------------
# Constructor Helpers

_rd_kwds = set([
'years', 'months', 'weeks', 'days',
'year', 'month', 'week', 'day', 'weekday',
'hour', 'minute', 'second', 'microsecond',
'nanosecond', 'nanoseconds',
'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'
])


def _determine_offset(kwds):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment this is a method of DateOffset that only gets called in __init__.

# timedelta is used for sub-daily plural offsets and all singular
# offsets relativedelta is used for plural offsets of daily length or
# more nanosecond(s) are handled by apply_wraps
kwds_no_nanos = dict(
(k, v) for k, v in kwds.items()
if k not in ('nanosecond', 'nanoseconds')
)
# TODO: Are nanosecond and nanoseconds allowed somewhere?

_kwds_use_relativedelta = (
'years', 'months', 'weeks', 'days',
'year', 'month', 'week', 'day', 'weekday',
'hour', 'minute', 'second', 'microsecond'
)

use_relativedelta = False
if len(kwds_no_nanos) > 0:
if any(k in _kwds_use_relativedelta for k in kwds_no_nanos):
offset = relativedelta(**kwds_no_nanos)
use_relativedelta = True
else:
# sub-daily offset - use timedelta (tz-aware)
offset = timedelta(**kwds_no_nanos)
else:
offset = timedelta(1)
return offset, use_relativedelta


# ---------------------------------------------------------------------
# Mixins & Singletons

Expand All @@ -206,3 +272,109 @@ class ApplyTypeError(TypeError):
# TODO: unused. remove?
class CacheableOffset(object):
_cacheable = True


class BeginMixin(object):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BeginMixin and EndMixin are new, each only have the one method. At the moment these methods are in DateOffset, but they are only used by a small handful of FooBegin and BarEnd subclasses.

# helper for vectorized offsets

def _beg_apply_index(self, i, freq):
"""Offsets index to beginning of Period frequency"""

off = i.to_perioddelta('D')

base, mult = get_freq_code(freq)
base_period = i.to_period(base)
if self.n <= 0:
# when subtracting, dates on start roll to prior
roll = np.where(base_period.to_timestamp() == i - off,
self.n, self.n + 1)
else:
roll = self.n

base = (base_period + roll).to_timestamp()
return base + off


class EndMixin(object):
# helper for vectorized offsets

def _end_apply_index(self, i, freq):
"""Offsets index to end of Period frequency"""

off = i.to_perioddelta('D')

base, mult = get_freq_code(freq)
base_period = i.to_period(base)
if self.n > 0:
# when adding, dates on end roll to next
roll = np.where(base_period.to_timestamp(how='end') == i - off,
self.n, self.n - 1)
else:
roll = self.n

base = (base_period + roll).to_timestamp(how='end')
return base + off


# ---------------------------------------------------------------------
# Base Classes

class _BaseOffset(object):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are you creating a base class here? what is the purpose?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IOW why not simply have 1 Base class (and not a _BaseOffset and a BaseOffset)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comments about remaining cython/pickle issues.

You're absolutely right that in its current form having two separate classes accomplishes nothing. The idea is that _BaseOffset should be a cdef class, while BaseOffset should be python class. (__rfoo__ methods do not play nicely with cython classes).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok that is fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would probably leave this as a class for the moment. I am not convinced this actually needs to be a full c-extension class (e.g. its not like we are inheriting from a python c-class here). I don't see the benefit and it has added complexity.

Copy link
Member Author

@jbrockmendel jbrockmendel Oct 29, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main reason is to achieve immutability. That's the big roadblock between us and making __eq__, __ne__, __hash__ performance not-awful. (There's an issue somewhere about "scalar types immutable" or something like that)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

"""
Base class for DateOffset methods that are not overriden by subclasses
and will (after pickle errors are resolved) go into a cdef class.
"""
_typ = "dateoffset"
_normalize_cache = True
_cacheable = False

def __call__(self, other):
return self.apply(other)

def __mul__(self, someInt):
return self.__class__(n=someInt * self.n, normalize=self.normalize,
**self.kwds)

def __neg__(self):
# Note: we are defering directly to __mul__ instead of __rmul__, as
# that allows us to use methods that can go in a `cdef class`
return self * -1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the status quo __neg__ is defined as return self.__class__(-self.n, normalize=self.normalize, **self.kwds). By deferring to __mul__, we move away from the self.kwds pattern. Ditto for copy.


def copy(self):
# Note: we are defering directly to __mul__ instead of __rmul__, as
# that allows us to use methods that can go in a `cdef class`
return self * 1

# TODO: this is never true. fix it or get rid of it
def _should_cache(self):
return self.isAnchored() and self._cacheable

def __repr__(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

side note, the repr is currently used for hashing, but instead should simply define __hash__ I think.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__hash__ is defined using _params() which is the god-awful slow thing we need to get rid of.

className = getattr(self, '_outputName', type(self).__name__)

if abs(self.n) != 1:
plural = 's'
else:
plural = ''

n_str = ""
if self.n != 1:
n_str = "%s * " % self.n

out = '<%s' % n_str + className + plural + self._repr_attrs() + '>'
return out


class BaseOffset(_BaseOffset):
# Here we add __rfoo__ methods that don't play well with cdef classes
def __rmul__(self, someInt):
return self.__mul__(someInt)

def __radd__(self, other):
return self.__add__(other)

def __rsub__(self, other):
if getattr(other, '_typ', None) in ['datetimeindex', 'series']:
# i.e. isinstance(other, (ABCDatetimeIndex, ABCSeries))
return other - self
return -self + other
7 changes: 5 additions & 2 deletions pandas/tests/tseries/test_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
from pandas.tseries.holiday import USFederalHolidayCalendar


data_dir = tm.get_data_path()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moving this call to up here ensures that we get the same data_dir whether running the tests via pytest or interactively. Under the status quo, copy/pasting the pertinent test below will fail because get_data_path will not behave as expected.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

huh? we use this pattern everywhere, why are you changing this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because when I try to run these tests interactively and copy/paste the contents of a test function, tm.get_data_path returns unexpected results depending on os.getcwd(). AFAICT when run non-interactively it behaves as if cwd is pandas/tests/tseries.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you mean 'interactively'? you should simply be running

pytest pandas/tests/...... -k ... or whatever that is the idiomatic way to run tests.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When a test fails and I want to figure out why, I run the contents of the test manually in the REPL.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Happy to revert this change; not that big a deal.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes pls revert.

standard way to run tests is

pytest path/to/test -k optional_regex

lots of options, including --pdb to drop into the debuger

pls revert this is non-standard



def test_monthrange():
import calendar
for y in range(2000, 2013):
Expand Down Expand Up @@ -480,7 +483,7 @@ def test_pickle_v0_15_2(self):
'Day': Day(1),
'YearBegin': YearBegin(1),
'Week': Week(1)}
pickle_path = os.path.join(tm.get_data_path(),
pickle_path = os.path.join(data_dir,
'dateoffset_0_15_2.pickle')
# This code was executed once on v0.15.2 to generate the pickle:
# with open(pickle_path, 'wb') as f: pickle.dump(offsets, f)
Expand Down Expand Up @@ -1884,7 +1887,7 @@ def _check_roundtrip(obj):
def test_pickle_compat_0_14_1(self):
hdays = [datetime(2013, 1, 1) for ele in range(4)]

pth = tm.get_data_path()
pth = data_dir

cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle'))
cday = CDay(holidays=hdays)
Expand Down
Loading