Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Validate AsOfJoin tolerance and attempt interval unit conversion #1952

Merged
merged 6 commits into from
Sep 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 46 additions & 2 deletions ibis/expr/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,21 @@ class Interval(DataType):
ns='nanosecond',
)

# Lookup table from plural timedelta component names (keys) to the short
# interval unit codes (values).
_timedelta_to_interval_units = {
    'days': 'D',
    'hours': 'h',
    'minutes': 'm',
    'seconds': 's',
    'milliseconds': 'ms',
    'microseconds': 'us',
    'nanoseconds': 'ns',
}

def _convert_timedelta_unit_to_interval_unit(self, unit: str):
    """Translate a timedelta component name into an interval unit code.

    Parameters
    ----------
    unit : str
        A key of ``self._timedelta_to_interval_units``, e.g. ``'days'``.

    Returns
    -------
    The value stored under ``unit`` in ``self._timedelta_to_interval_units``.

    Raises
    ------
    ValueError
        If ``unit`` is not a key of the mapping.
    """
    try:
        return self._timedelta_to_interval_units[unit]
    except KeyError:
        # Callers catch ValueError, so the exception type is kept; the
        # message names the offending unit instead of being empty.
        raise ValueError(
            'Unsupported timedelta unit `{}`'.format(unit)
        ) from None

def __init__(
self,
unit: str = 's',
Expand All @@ -414,7 +429,10 @@ def __init__(
) -> None:
super().__init__(nullable=nullable)
if unit not in self._units:
raise ValueError('Unsupported interval unit `{}`'.format(unit))
try:
unit = self._convert_timedelta_unit_to_interval_unit(unit)
except ValueError:
raise ValueError('Unsupported interval unit `{}`'.format(unit))

if value_type is None:
value_type = int32
Expand Down Expand Up @@ -1422,6 +1440,25 @@ def type(self) -> DataType:
validate_type = dtype


def _get_timedelta_units(timedelta: datetime.timedelta) -> List[str]:
    """Return the names of the strictly positive components of a timedelta.

    Parameters
    ----------
    timedelta : datetime.timedelta
        The value to inspect. If it exposes a ``components`` attribute, the
        fields of that object are inspected; otherwise the ``days``,
        ``seconds`` and ``microseconds`` attributes of the value are used.

    Returns
    -------
    List[str]
        The inspected field names whose value is greater than zero, in field
        order. Zero and negative components are not reported.
    """
    if hasattr(timedelta, 'components'):
        # pandas Timedelta has more granularity
        base_object = timedelta.components
        unit_fields = base_object._fields
    else:
        # datetime.timedelta only stores days, seconds, and microseconds
        # internally
        base_object = timedelta
        unit_fields = ['days', 'seconds', 'microseconds']

    return [
        field for field in unit_fields if getattr(base_object, field) > 0
    ]


@dtype.register(object)
def default(value, **kwargs) -> DataType:
    """Reject a value that is registered here under ``object``.

    Always raises ``com.IbisTypeError`` with a message containing the
    ``repr`` of ``value``; ``kwargs`` are accepted and ignored.
    """
    message = 'Value {!r} is not a valid datatype'.format(value)
    raise com.IbisTypeError(message)
Expand Down Expand Up @@ -1536,7 +1573,14 @@ def infer_timestamp(value: datetime.datetime) -> Timestamp:

@infer.register(datetime.timedelta)
def infer_interval(value: datetime.timedelta) -> Interval:
    """Infer an interval datatype from a timedelta value.

    When ``_get_timedelta_units`` reports exactly one unit for ``value``, an
    ``Interval`` built from that unit name is returned. In every other case
    (no unit, or several units) the module-level ``interval`` is returned.
    """
    time_units = _get_timedelta_units(value)
    # we can attempt a conversion in the simplest case, i.e. there is exactly
    # one unit (e.g. pd.Timedelta('2 days') vs. pd.Timedelta('2 days 3 hours'))
    if len(time_units) == 1:
        return Interval(time_units[0])
    return interval


@infer.register(str)
Expand Down
8 changes: 8 additions & 0 deletions ibis/expr/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import itertools
import operator
from contextlib import suppress
from typing import List

import toolz

Expand Down Expand Up @@ -1732,6 +1733,13 @@ def __init__(self, left, right, predicates, by, tolerance):
super().__init__(left, right, predicates)
self.by = _clean_join_predicates(self.left, self.right, by)
self.tolerance = tolerance
self._validate_args(['by', 'tolerance'])

def _validate_args(self, args: List[str]):
    """Validate the named attributes and store the validated values.

    For each name in ``args`` the entry of ``self.signature`` under that name
    is looked up, its ``validate`` method is called with the attribute's
    current value, and the result is assigned back to the same attribute.

    Parameters
    ----------
    args : List[str]
        Attribute names; each must be a key of ``self.signature`` and an
        existing attribute of ``self``.
    """
    for name in args:
        validated = self.signature[name].validate(getattr(self, name))
        setattr(self, name, validated)


class Union(TableNode, HasSchema):
Expand Down
55 changes: 54 additions & 1 deletion ibis/expr/tests/test_datatypes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import datetime
from collections import OrderedDict

import pandas as pd
import pytest
import pytz
from multipledispatch.conflict import ambiguities
from pytest import param

import ibis
import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -367,7 +369,13 @@ def test_time_valid():
('foo', dt.string),
(datetime.date.today(), dt.date),
(datetime.datetime.now(), dt.timestamp),
(datetime.timedelta(days=3), dt.interval),
(datetime.timedelta(days=3), dt.Interval(unit='D')),
(pd.Timedelta('5 hours'), dt.Interval(unit='h')),
(pd.Timedelta('7 minutes'), dt.Interval(unit='m')),
(datetime.timedelta(seconds=9), dt.Interval(unit='s')),
(pd.Timedelta('11 milliseconds'), dt.Interval(unit='ms')),
(datetime.timedelta(microseconds=15), dt.Interval(unit='us')),
(pd.Timedelta('17 nanoseconds'), dt.Interval(unit='ns')),
# numeric types
(5, dt.int8),
(5, dt.int8),
Expand Down Expand Up @@ -417,6 +425,51 @@ def test_time_valid():
]
),
),
param(
datetime.timedelta(hours=5),
dt.Interval(unit='h'),
id='dateime hours',
marks=pytest.mark.xfail(
reason='Hour conversion from datetime.timedelta to ibis '
'interval not supported'
),
),
param(
datetime.timedelta(minutes=7),
dt.Interval(unit='m'),
id='dateime minutes',
marks=pytest.mark.xfail(
reason='Minute conversion from datetime.timedelta to ibis '
'interval not supported'
),
),
param(
datetime.timedelta(milliseconds=11),
dt.Interval(unit='ms'),
id='dateime milliseconds',
marks=pytest.mark.xfail(
reason='Millisecond conversion from datetime.timedelta to '
'ibis interval not supported'
),
),
param(
pd.Timedelta('3', unit='W'),
dt.Interval(unit='W'),
id='weeks',
marks=pytest.mark.xfail(
reason='Week conversion from Timedelta to ibis interval '
'not supported'
),
),
param(
pd.Timedelta('3', unit='Y'),
dt.Interval(unit='Y'),
id='years',
marks=pytest.mark.xfail(
reason='Year conversion from Timedelta to ibis interval '
'not supported'
),
),
],
)
def test_infer_dtype(value, expected_dtype):
Expand Down
38 changes: 38 additions & 0 deletions ibis/expr/tests/test_table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import datetime
import pickle
import re

import pandas as pd
import pytest

import ibis
Expand Down Expand Up @@ -695,6 +697,42 @@ def test_asof_join_with_by():
assert by.left.op().name == by.right.op().name == 'key'


@pytest.mark.parametrize(
    ('ibis_interval', 'timedelta_interval'),
    [
        [ibis.interval(days=2), pd.Timedelta('2 days')],
        [ibis.interval(days=2), datetime.timedelta(days=2)],
        [ibis.interval(hours=5), pd.Timedelta('5 hours')],
        [ibis.interval(hours=5), datetime.timedelta(hours=5)],
        [ibis.interval(minutes=7), pd.Timedelta('7 minutes')],
        [ibis.interval(minutes=7), datetime.timedelta(minutes=7)],
        [ibis.interval(seconds=9), pd.Timedelta('9 seconds')],
        [ibis.interval(seconds=9), datetime.timedelta(seconds=9)],
        [ibis.interval(milliseconds=11), pd.Timedelta('11 milliseconds')],
        [ibis.interval(milliseconds=11), datetime.timedelta(milliseconds=11)],
        [ibis.interval(microseconds=15), pd.Timedelta('15 microseconds')],
        [ibis.interval(microseconds=15), datetime.timedelta(microseconds=15)],
        [ibis.interval(nanoseconds=17), pd.Timedelta('17 nanoseconds')],
    ],
)
def test_asof_join_with_tolerance(ibis_interval, timedelta_interval):
    """Check the tolerance stored on an asof join for both input kinds.

    An ibis interval passed as ``tolerance`` must come back equal to itself;
    a pandas/datetime timedelta must come back as an interval scalar whose
    operation is a literal.
    """
    left = ibis.table(
        [('time', 'int32'), ('key', 'int32'), ('value', 'double')]
    )
    right = ibis.table(
        [('time', 'int32'), ('key', 'int32'), ('value2', 'double')]
    )

    # ibis interval expression given directly
    joined = api.asof_join(left, right, 'time', tolerance=ibis_interval)
    tolerance = joined.op().tolerance
    assert_equal(tolerance, ibis_interval)

    # timedelta object given instead of an ibis expression
    joined = api.asof_join(left, right, 'time', tolerance=timedelta_interval)
    tolerance = joined.op().tolerance
    assert isinstance(tolerance, ir.IntervalScalar)
    assert isinstance(tolerance.op(), ops.Literal)


def test_equijoin_schema_merge():
table1 = ibis.table([('key1', 'string'), ('value1', 'double')])
table2 = ibis.table([('key2', 'string'), ('stuff', 'int32')])
Expand Down