From 97fa82628272db95dfdec36f54fc7d25c353d0a3 Mon Sep 17 00:00:00 2001 From: Emily Reff Date: Fri, 6 Sep 2019 14:44:45 -0400 Subject: [PATCH 1/5] validate AsOfJoin tolerance and attempt interval unit conversion --- ibis/expr/datatypes.py | 64 +++++++++++++++++++++++++++++------ ibis/expr/operations.py | 17 ++++++++-- ibis/expr/tests/test_table.py | 57 ++++++++++++++++++++++++++++++- 3 files changed, 124 insertions(+), 14 deletions(-) diff --git a/ibis/expr/datatypes.py b/ibis/expr/datatypes.py index 739b1d63b468..23fe88905eaa 100644 --- a/ibis/expr/datatypes.py +++ b/ibis/expr/datatypes.py @@ -32,6 +32,7 @@ try: if sys.version_info >= (3, 6): import shapely.geometry + IS_SHAPELY_AVAILABLE = True except ImportError: ... @@ -405,6 +406,21 @@ class Interval(DataType): ns='nanosecond', ) + _timedelta_to_interval_units = dict( + days='D', + hours='h', + minutes='m', + seconds='s', + milliseconds='ms', + microseconds='us', + nanoseconds='ns', + ) + + def _convert_timedelta_unit_to_interval_unit(self, unit: str): + if unit not in self._timedelta_to_interval_units: + raise ValueError + return self._timedelta_to_interval_units[unit] + def __init__( self, unit: str = 's', @@ -413,7 +429,10 @@ def __init__( ) -> None: super().__init__(nullable=nullable) if unit not in self._units: - raise ValueError('Unsupported interval unit `{}`'.format(unit)) + try: + unit = self._convert_timedelta_unit_to_interval_unit(unit) + except ValueError: + raise ValueError('Unsupported interval unit `{}`'.format(unit)) if value_type is None: value_type = int32 @@ -536,11 +555,10 @@ def _literal_value_hash_key(self, value): def _tuplize(values): """Recursively convert `values` to a tuple of tuples.""" + def tuplize_iter(values): yield from ( - tuple(tuplize_iter(value)) - if util.is_iterable(value) - else value + tuple(tuplize_iter(value)) if util.is_iterable(value) else value for value in values ) @@ -665,7 +683,7 @@ def _literal_value_hash_key(self, value): shapely.geometry.Polygon, shapely.geometry.MultiLineString, shapely.geometry.MultiPoint, - shapely.geometry.MultiPolygon + shapely.geometry.MultiPolygon, ) if isinstance(value, geo_shapes): return self, value.wkt @@ -1422,6 +1440,17 @@ def type(self) -> DataType: validate_type = dtype +def _get_timedelta_units(timedelta: datetime.timedelta) -> List[str]: + unit_fields = timedelta.components._fields + time_units = [] + [ + time_units.append(field) + for field in unit_fields + if getattr(timedelta.components, field) > 0 + ] + return time_units + + @dtype.register(object) def default(value, **kwargs) -> DataType: raise com.IbisTypeError('Value {!r} is not a valid datatype'.format(value)) @@ -1536,7 +1565,14 @@ def infer_timestamp(value: datetime.datetime) -> Timestamp: @infer.register(datetime.timedelta) def infer_interval(value: datetime.timedelta) -> Interval: - return interval + time_units = _get_timedelta_units(value) + # we can attempt a conversion in the simplest case, i.e. there is exactly + # one unit (e.g. pd.Timedelta('2 days') vs. pd.Timedelta('2 days 3 hours') + if len(time_units) == 1: + unit = time_units[0] + return Interval(unit) + else: + return interval @infer.register(str) @@ -1572,13 +1608,14 @@ def infer_null(value: Optional[Null]) -> Null: if IS_SHAPELY_AVAILABLE: + @infer.register(shapely.geometry.Point) def infer_shapely_point(value: shapely.geometry.Point) -> Point: return point @infer.register(shapely.geometry.LineString) def infer_shapely_linestring( - value: shapely.geometry.LineString + value: shapely.geometry.LineString, ) -> LineString: return linestring @@ -1588,19 +1625,19 @@ def infer_shapely_polygon(value: shapely.geometry.Polygon) -> Polygon: @infer.register(shapely.geometry.MultiLineString) def infer_shapely_multilinestring( - value: shapely.geometry.MultiLineString + value: shapely.geometry.MultiLineString, ) -> MultiLineString: return multilinestring @infer.register(shapely.geometry.MultiPoint) def infer_shapely_multipoint( - value: shapely.geometry.MultiPoint + value: shapely.geometry.MultiPoint, ) -> MultiPoint: return multipoint @infer.register(shapely.geometry.MultiPolygon) def infer_shapely_multipolygon( - value: shapely.geometry.MultiPolygon + value: shapely.geometry.MultiPolygon, ) -> MultiPolygon: return multipolygon @@ -1721,7 +1758,12 @@ def can_cast_variadic( # geo spatial data type # cast between same type, used to cast from/to geometry and geography GEO_TYPES = ( - Point, LineString, Polygon, MultiLineString, MultiPoint, MultiPolygon + Point, + LineString, + Polygon, + MultiLineString, + MultiPoint, + MultiPolygon, ) diff --git a/ibis/expr/operations.py b/ibis/expr/operations.py index d86ca2b73276..0cc1f33392c7 100644 --- a/ibis/expr/operations.py +++ b/ibis/expr/operations.py @@ -3,6 +3,7 @@ import itertools import operator from contextlib import suppress +from typing import List import toolz @@ -1026,8 +1027,10 @@ def __init__(self, expr, window): window = window.bind(table) if window.max_lookback is not None: - error_msg = ("'max lookback' windows must be ordered " - "by a timestamp column") + error_msg = ( + "'max lookback' windows must be ordered " + "by a timestamp column" + ) if len(window._order_by) != 1: raise com.IbisInputError(error_msg) order_var = window._order_by[0].op().args[0] @@ -1730,6 +1733,13 @@ def __init__(self, left, right, predicates, by, tolerance): super().__init__(left, right, predicates) self.by = _clean_join_predicates(self.left, self.right, by) self.tolerance = tolerance + self._validate_args(['by', 'tolerance']) + + def _validate_args(self, args: List[str]): + for arg in args: + argument = self.signature[arg] + value = argument.validate(getattr(self, arg)) + setattr(self, arg, value) class Union(TableNode, HasSchema): @@ -3196,6 +3206,7 @@ class GeoSRID(GeoSpatialUnOp): class GeoSetSRID(GeoSpatialUnOp): """Set the spatial reference identifier for the ST_Geometry.""" + srid = Arg(rlz.integer) output_type = rlz.shape_like('args', dt.geometry) @@ -3221,6 +3232,7 @@ class GeoDFullyWithin(GeoSpatialBinOp): """Returns True if the geometries are fully within the specified distance of one another. """ + distance = Arg(rlz.floating) output_type = rlz.shape_like('args', dt.boolean) @@ -3230,6 +3242,7 @@ class GeoDWithin(GeoSpatialBinOp): """Returns True if the geometries are within the specified distance of one another. """ + distance = Arg(rlz.floating) output_type = rlz.shape_like('args', dt.boolean) diff --git a/ibis/expr/tests/test_table.py b/ibis/expr/tests/test_table.py index a70640b2ad0d..1a48ff859099 100644 --- a/ibis/expr/tests/test_table.py +++ b/ibis/expr/tests/test_table.py @@ -1,7 +1,9 @@ import pickle import re +import pandas as pd import pytest +from pytest import param import ibis import ibis.common.exceptions as com @@ -695,6 +697,59 @@ def test_asof_join_with_by(): assert by.left.op().name == by.right.op().name == 'key' +@pytest.mark.parametrize( + ('ibis_interval', 'timedelta_interval'), + [ + [ibis.interval(days=2), pd.Timedelta('2 days')], + [ibis.interval(hours=5), pd.Timedelta('5 hours')], + [ibis.interval(minutes=7), pd.Timedelta('7 minutes')], + [ibis.interval(seconds=9), pd.Timedelta('7 seconds')], + [ibis.interval(milliseconds=9), pd.Timedelta('9 milliseconds')], + [ibis.interval(microseconds=11), pd.Timedelta('11 microseconds')], + [ibis.interval(nanoseconds=17), pd.Timedelta('17 nanoseconds')], + param( + ibis.interval(weeks=3), + pd.Timedelta('3 W'), + id='weeks', + marks=pytest.mark.xfail( + reason='Week conversion from Timedelta to ibis interval ' + 'not supported' + ), + ), + param( + ibis.interval(years=3), + pd.Timedelta('3 Y'), + id='years', + marks=pytest.mark.xfail( + reason='Year conversion from Timedelta to ibis interval ' + 'not supported' + ), + ), + ], +) +def test_asof_join_with_tolerance(ibis_interval, timedelta_interval): + left = ibis.table( + [('time', 'int32'), ('key', 'int32'), ('value', 'double')] + ) + right = ibis.table( + [('time', 'int32'), ('key', 'int32'), ('value2', 'double')] + ) + + joined = api.asof_join(left, right, 'time', tolerance=ibis_interval) + tolerance = joined.op().tolerance + assert_equal(tolerance, ibis_interval) + + joined = api.asof_join(left, right, 'time', tolerance=timedelta_interval) + tolerance = joined.op().tolerance + + assert isinstance(tolerance, ir.IntervalScalar) + assert isinstance(tolerance.op(), ops.Literal) + + ibis_interval_unit = ibis_interval.op().dtype.unit + timedelta_unit = tolerance.op().dtype.unit + assert timedelta_unit == ibis_interval_unit + + def test_equijoin_schema_merge(): table1 = ibis.table([('key1', 'string'), ('value1', 'double')]) table2 = ibis.table([('key2', 'string'), ('stuff', 'int32')]) @@ -1064,7 +1119,7 @@ def test_cannot_use_existence_expression_in_join(table): def test_not_exists_predicate(t1, t2): - cond = -(t1.key1 == t2.key1).any() + cond = -((t1.key1 == t2.key1).any()) assert isinstance(cond.op(), ops.NotAny) From c524401d6f1f24f818bda2b9e39ba8978fc66552 Mon Sep 17 00:00:00 2001 From: Emily Reff Date: Mon, 9 Sep 2019 12:10:32 -0400 Subject: [PATCH 2/5] fix failing tests --- ibis/expr/datatypes.py | 12 ++++++++++-- ibis/expr/tests/test_table.py | 37 ++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/ibis/expr/datatypes.py b/ibis/expr/datatypes.py index 23fe88905eaa..599a04d6d539 100644 --- a/ibis/expr/datatypes.py +++ b/ibis/expr/datatypes.py @@ -1441,12 +1441,20 @@ def type(self) -> DataType: def _get_timedelta_units(timedelta: datetime.timedelta) -> List[str]: - unit_fields = timedelta.components._fields + # pandas Timedelta has more granularity + if hasattr(timedelta, 'components'): + unit_fields = timedelta.components._fields + base_object = timedelta.components + # datetime.timedelta only stores days, seconds, and microseconds internally + else: + unit_fields = ['days', 'seconds', 'microseconds'] + base_object = timedelta + time_units = [] [ time_units.append(field) for field in unit_fields - if getattr(timedelta.components, field) > 0 + if getattr(base_object, field) > 0 ] return time_units diff --git a/ibis/expr/tests/test_table.py b/ibis/expr/tests/test_table.py index 1a48ff859099..0ba86b6bcccf 100644 --- a/ibis/expr/tests/test_table.py +++ b/ibis/expr/tests/test_table.py @@ -1,3 +1,4 @@ +import datetime import pickle import re @@ -701,15 +702,45 @@ def test_asof_join_with_by(): ('ibis_interval', 'timedelta_interval'), [ [ibis.interval(days=2), pd.Timedelta('2 days')], + [ibis.interval(days=2), datetime.timedelta(days=2)], [ibis.interval(hours=5), pd.Timedelta('5 hours')], [ibis.interval(minutes=7), pd.Timedelta('7 minutes')], [ibis.interval(seconds=9), pd.Timedelta('7 seconds')], - [ibis.interval(milliseconds=9), pd.Timedelta('9 milliseconds')], - [ibis.interval(microseconds=11), pd.Timedelta('11 microseconds')], + [ibis.interval(seconds=9), datetime.timedelta(seconds=9)], + [ibis.interval(milliseconds=11), pd.Timedelta('11 milliseconds')], + [ibis.interval(microseconds=15), pd.Timedelta('15 microseconds')], + [ibis.interval(microseconds=15), datetime.timedelta(microseconds=15)], [ibis.interval(nanoseconds=17), pd.Timedelta('17 nanoseconds')], + param( + ibis.interval(hours=5), + datetime.timedelta(hours=5), + id='dateime hours', + marks=pytest.mark.xfail( + reason='Hour conversion from datetime.timedelta to ibis ' + 'interval not supported' + ), + ), + param( + ibis.interval(minutes=7), + datetime.timedelta(minutes=7), + id='dateime minutes', + marks=pytest.mark.xfail( + reason='Minute conversion from datetime.timedelta to ibis ' + 'interval not supported' + ), + ), + param( + ibis.interval(milliseconds=11), + datetime.timedelta(milliseconds=11), + id='dateime milliseconds', + marks=pytest.mark.xfail( + reason='Millisecond conversion from datetime.timedelta to ' + 'ibis interval not supported' + ), + ), param( ibis.interval(weeks=3), - pd.Timedelta('3 W'), + pd.Timedelta('3', unit='W'), id='weeks', marks=pytest.mark.xfail( reason='Week conversion from Timedelta to ibis interval ' From 91318be8e168849b825b19e35f60790ed773b783 Mon Sep 17 00:00:00 2001 From: Emily Reff Date: Mon, 9 Sep 2019 13:39:28 -0400 Subject: [PATCH 3/5] fix failing tests --- ibis/expr/tests/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ibis/expr/tests/test_table.py b/ibis/expr/tests/test_table.py index 0ba86b6bcccf..d2e4e8e00174 100644 --- a/ibis/expr/tests/test_table.py +++ b/ibis/expr/tests/test_table.py @@ -749,7 +749,7 @@ def test_asof_join_with_by(): ), param( ibis.interval(years=3), - pd.Timedelta('3 Y'), + pd.Timedelta('3', unit='Y'), id='years', marks=pytest.mark.xfail( reason='Year conversion from Timedelta to ibis interval ' From 134caf3a235af7c68d81d35a24a9d83f504cd0ec Mon Sep 17 00:00:00 2001 From: Emily Reff Date: Mon, 9 Sep 2019 16:44:44 -0400 Subject: [PATCH 4/5] fix failing tests --- ibis/expr/tests/test_datatypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ibis/expr/tests/test_datatypes.py b/ibis/expr/tests/test_datatypes.py index e0d1dadd8bb3..9c6c9613411a 100644 --- a/ibis/expr/tests/test_datatypes.py +++ b/ibis/expr/tests/test_datatypes.py @@ -367,7 +367,7 @@ def test_time_valid(): ('foo', dt.string), (datetime.date.today(), dt.date), (datetime.datetime.now(), dt.timestamp), - (datetime.timedelta(days=3), dt.interval), + (datetime.timedelta(days=3), dt.Interval(unit='D')), # numeric types (5, dt.int8), (5, dt.int8), From 40436e10ba4b20011b256abb39ceb1080e70eb3b Mon Sep 17 00:00:00 2001 From: Emily Reff Date: Wed, 11 Sep 2019 16:20:30 -0400 Subject: [PATCH 5/5] break up tests --- ibis/expr/tests/test_datatypes.py | 53 +++++++++++++++++++++++++++++ ibis/expr/tests/test_table.py | 56 +++---------------------------- 2 files changed, 57 insertions(+), 52 deletions(-) diff --git a/ibis/expr/tests/test_datatypes.py b/ibis/expr/tests/test_datatypes.py index 9c6c9613411a..16ee60fe407f 100644 --- a/ibis/expr/tests/test_datatypes.py +++ b/ibis/expr/tests/test_datatypes.py @@ -1,9 +1,11 @@ import datetime from collections import OrderedDict +import pandas as pd import pytest import pytz from multipledispatch.conflict import ambiguities +from pytest import param import ibis import ibis.expr.datatypes as dt @@ -368,6 +370,12 @@ def test_time_valid(): (datetime.date.today(), dt.date), (datetime.datetime.now(), dt.timestamp), (datetime.timedelta(days=3), dt.Interval(unit='D')), + (pd.Timedelta('5 hours'), dt.Interval(unit='h')), + (pd.Timedelta('7 minutes'), dt.Interval(unit='m')), + (datetime.timedelta(seconds=9), dt.Interval(unit='s')), + (pd.Timedelta('11 milliseconds'), dt.Interval(unit='ms')), + (datetime.timedelta(microseconds=15), dt.Interval(unit='us')), + (pd.Timedelta('17 nanoseconds'), dt.Interval(unit='ns')), # numeric types (5, dt.int8), (5, dt.int8), @@ -417,6 +425,51 @@ def test_time_valid(): ] ), ), + param( + datetime.timedelta(hours=5), + dt.Interval(unit='h'), + id='dateime hours', + marks=pytest.mark.xfail( + reason='Hour conversion from datetime.timedelta to ibis ' + 'interval not supported' + ), + ), + param( + datetime.timedelta(minutes=7), + dt.Interval(unit='m'), + id='dateime minutes', + marks=pytest.mark.xfail( + reason='Minute conversion from datetime.timedelta to ibis ' + 'interval not supported' + ), + ), + param( + datetime.timedelta(milliseconds=11), + dt.Interval(unit='ms'), + id='dateime milliseconds', + marks=pytest.mark.xfail( + reason='Millisecond conversion from datetime.timedelta to ' + 'ibis interval not supported' + ), + ), + param( + pd.Timedelta('3', unit='W'), + dt.Interval(unit='W'), + id='weeks', + marks=pytest.mark.xfail( + reason='Week conversion from Timedelta to ibis interval ' + 'not supported' + ), + ), + param( + pd.Timedelta('3', unit='Y'), + dt.Interval(unit='Y'), + id='years', + marks=pytest.mark.xfail( + reason='Year conversion from Timedelta to ibis interval ' + 'not supported' + ), + ), ], ) def test_infer_dtype(value, expected_dtype): diff --git a/ibis/expr/tests/test_table.py b/ibis/expr/tests/test_table.py index d2e4e8e00174..e8950fac00d6 100644 --- a/ibis/expr/tests/test_table.py +++ b/ibis/expr/tests/test_table.py @@ -4,7 +4,6 @@ import pandas as pd import pytest -from pytest import param import ibis import ibis.common.exceptions as com @@ -704,58 +703,16 @@ def test_asof_join_with_by(): [ibis.interval(days=2), pd.Timedelta('2 days')], [ibis.interval(days=2), datetime.timedelta(days=2)], [ibis.interval(hours=5), pd.Timedelta('5 hours')], + [ibis.interval(hours=5), datetime.timedelta(hours=5)], [ibis.interval(minutes=7), pd.Timedelta('7 minutes')], - [ibis.interval(seconds=9), pd.Timedelta('7 seconds')], + [ibis.interval(minutes=7), datetime.timedelta(minutes=7)], + [ibis.interval(seconds=9), pd.Timedelta('9 seconds')], [ibis.interval(seconds=9), datetime.timedelta(seconds=9)], [ibis.interval(milliseconds=11), pd.Timedelta('11 milliseconds')], + [ibis.interval(milliseconds=11), datetime.timedelta(milliseconds=11)], [ibis.interval(microseconds=15), pd.Timedelta('15 microseconds')], [ibis.interval(microseconds=15), datetime.timedelta(microseconds=15)], [ibis.interval(nanoseconds=17), pd.Timedelta('17 nanoseconds')], - param( - ibis.interval(hours=5), - datetime.timedelta(hours=5), - id='dateime hours', - marks=pytest.mark.xfail( - reason='Hour conversion from datetime.timedelta to ibis ' - 'interval not supported' - ), - ), - param( - ibis.interval(minutes=7), - datetime.timedelta(minutes=7), - id='dateime minutes', - marks=pytest.mark.xfail( - reason='Minute conversion from datetime.timedelta to ibis ' - 'interval not supported' - ), - ), - param( - ibis.interval(milliseconds=11), - datetime.timedelta(milliseconds=11), - id='dateime milliseconds', - marks=pytest.mark.xfail( - reason='Millisecond conversion from datetime.timedelta to ' - 'ibis interval not supported' - ), - ), - param( - ibis.interval(weeks=3), - pd.Timedelta('3', unit='W'), - id='weeks', - marks=pytest.mark.xfail( - reason='Week conversion from Timedelta to ibis interval ' - 'not supported' - ), - ), - param( - ibis.interval(years=3), - pd.Timedelta('3', unit='Y'), - id='years', - marks=pytest.mark.xfail( - reason='Year conversion from Timedelta to ibis interval ' - 'not supported' - ), - ), ], ) def test_asof_join_with_tolerance(ibis_interval, timedelta_interval): @@ -772,14 +729,9 @@ def test_asof_join_with_tolerance(ibis_interval, timedelta_interval): joined = api.asof_join(left, right, 'time', tolerance=timedelta_interval) tolerance = joined.op().tolerance - assert isinstance(tolerance, ir.IntervalScalar) assert isinstance(tolerance.op(), ops.Literal) - ibis_interval_unit = ibis_interval.op().dtype.unit - timedelta_unit = tolerance.op().dtype.unit - assert timedelta_unit == ibis_interval_unit - def test_equijoin_schema_merge(): table1 = ibis.table([('key1', 'string'), ('value1', 'double')])