diff --git a/CHANGELOG.md b/CHANGELOG.md index 505e1b093..917b24aae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Features +- Added `EventSet.tick_calendar()` operator. - Added `EventSet.where()` operator. - Add `filter_moving_count` operator. diff --git a/docs/src/reference/index.md b/docs/src/reference/index.md index 174ffaa55..af6dca0ec 100644 --- a/docs/src/reference/index.md +++ b/docs/src/reference/index.md @@ -63,6 +63,7 @@ Check the index on the left for a more detailed description of any symbol. | [`EventSet.set_index()`][temporian.EventSet.set_index] | Replaces the indexes in an [`EventSet`][temporian.EventSet]. | | [`EventSet.since_last()`][temporian.EventSet.since_last] | Computes the amount of time since the last distinct timestamp. | | [`EventSet.tick()`][temporian.EventSet.tick] | Generates timestamps at regular intervals in the range of a guide. | +| [`EventSet.tick_calendar()`][temporian.EventSet.tick_calendar] | Generates timestamps at the specified calendar date-time events. | | [`EventSet.timestamps()`][temporian.EventSet.timestamps] | Creates a feature from the events timestamps (`float64`). | | [`EventSet.unique_timestamps()`][temporian.EventSet.unique_timestamps] | Removes events with duplicated timestamps from an [`EventSet`][temporian.EventSet]. | | [`EventSet.until_next()`][temporian.EventSet.until_next] | Duration until the next sampling event. 
| diff --git a/docs/src/reference/temporian/operators/tick_calendar.md b/docs/src/reference/temporian/operators/tick_calendar.md new file mode 100644 index 000000000..598fe2f33 --- /dev/null +++ b/docs/src/reference/temporian/operators/tick_calendar.md @@ -0,0 +1 @@ +::: temporian.EventSet.tick_calendar diff --git a/temporian/core/event_set_ops.py b/temporian/core/event_set_ops.py index 7f57e766c..46a9f5783 100644 --- a/temporian/core/event_set_ops.py +++ b/temporian/core/event_set_ops.py @@ -15,7 +15,7 @@ # pylint: disable=import-outside-toplevel from __future__ import annotations -from typing import Any, Dict, List, Optional, Union, TYPE_CHECKING +from typing import Any, Dict, List, Literal, Optional, Union, TYPE_CHECKING from temporian.core.data.duration import Duration @@ -2423,6 +2423,146 @@ def tick( return tick(self, interval=interval, align=align) + def tick_calendar( + self: EventSetOrNode, + second: Optional[Union[int, Literal["*"]]] = None, + minute: Optional[Union[int, Literal["*"]]] = None, + hour: Optional[Union[int, Literal["*"]]] = None, + mday: Optional[Union[int, Literal["*"]]] = None, + month: Optional[Union[int, Literal["*"]]] = None, + wday: Optional[Union[int, Literal["*"]]] = None, + ) -> EventSetOrNode: + """Generates events periodically at fixed times or dates e.g. each month. + + Events are generated in the range of the input + [`EventSet`][temporian.EventSet] independently for each index. + + The interface is inspired by the crontab format, where arguments can + take a value of `'*'` to tick at all values, or a fixed integer to + tick only at that precise value. + + Non-specified values (`None`) are set to `'*'` if a finer + resolution argument is specified, or fixed to the first valid value if + a coarser resolution argument is specified. 
For example, setting only + `tick_calendar(hour='*')` + is equivalent to: + `tick_calendar(second=0, minute=0, hour='*', mday='*', month='*')` + , resulting in one tick at every exact hour of every day/month/year in + the input guide range. + + The datetime timezone is always assumed to be UTC. + + Examples: + ```python + >>> # Every day (at 00:00:00) in the period (exactly one year) + >>> a = tp.event_set(timestamps=["2021-01-01", "2021-12-31 23:59:59"]) + >>> b = a.tick_calendar(hour=0) + >>> b + indexes: ... + events: + (365 events): + timestamps: [...] + ... + + + >>> # Every day at 2:30am + >>> b = a.tick_calendar(hour=2, minute=30) + >>> tp.glue(b.calendar_hour(), b.calendar_minute()) + indexes: ... + events: + (365 events): + timestamps: [...] + 'calendar_hour': [2 2 2 ... 2 2 2] + 'calendar_minute': [30 30 30 ... 30 30 30] + ... + + + >>> # Day 5 of every month (at 00:00) + >>> b = a.tick_calendar(mday=5) + >>> b.calendar_day_of_month() + indexes: ... + events: + (12 events): + timestamps: [...] + 'calendar_day_of_month': [5 5 5 ... 5 5 5] + ... + + + >>> # 1st of February of every year + >>> a = tp.event_set(timestamps=["2020-01-01", "2021-12-31"]) + >>> b = a.tick_calendar(month=2) + >>> tp.glue(b.calendar_day_of_month(), b.calendar_month()) + indexes: ... + events: + (2 events): + timestamps: [...] + 'calendar_day_of_month': [1 1] + 'calendar_month': [2 2] + ... + + >>> # Every second in the period (2 hours -> 7200 seconds) + >>> a = tp.event_set(timestamps=["2020-01-01 00:00:00", + ... "2020-01-01 01:59:59"]) + >>> b = a.tick_calendar(second='*') + >>> b + indexes: ... + events: + (7200 events): + timestamps: [...] + ... + + >>> # Every second of the minute 30 of every hour (00:30 and 01:30) + >>> a = tp.event_set(timestamps=["2020-01-01 00:00", + ... "2020-01-01 02:00"]) + >>> b = a.tick_calendar(second='*', minute=30) + >>> b + indexes: ... + events: + (120 events): + timestamps: [...] + ... 
+ + >>> # Not allowed: intermediate arguments (minute, hour) not specified + >>> b = a.tick_calendar(second=1, mday=1) # ambiguous meaning + Traceback (most recent call last): + ... + ValueError: Can't set argument to None because previous and + following arguments were specified. Set to '*' or an integer ... + + ``` + + Args: + second: '*' (any second), None (auto) or number in range `[0-59]` + to tick at specific second of each minute. + minute: '*' (any minute), None (auto) or number in range `[0-59]` + to tick at specific minute of each hour. + hour: '*' (any hour), None (auto), or number in range `[0-23]` to + tick at specific hour of each day. + mday: '*' (any day), None (auto) or number in range `[1-31]` + to tick at specific day of each month. Note that months + without some particular day may not have any tick + (e.g: day 31 on February). + month: '*' (any month), None (auto) or number in range `[1-12]` to + tick at one particular month of each year. + wday: '*' (any day), None (auto) or number in range `[0-6]` + (Sun-Sat) to tick at particular day of week. Can only be + specified if `mday` is `None`. + + Returns: + A feature-less EventSet with timestamps at specified interval. + """ + from temporian.core.operators.tick_calendar import tick_calendar + + return tick_calendar( + self, + second=second, + minute=minute, + hour=hour, + mday=mday, + month=month, + wday=wday, + ) + def timestamps(self: EventSetOrNode) -> EventSetOrNode: """Converts an [`EventSet`][temporian.EventSet]'s timestamps into a `float64` feature. 
diff --git a/temporian/core/operators/BUILD b/temporian/core/operators/BUILD index d921c600b..8e519bb2f 100644 --- a/temporian/core/operators/BUILD +++ b/temporian/core/operators/BUILD @@ -362,6 +362,22 @@ py_library( ], ) +py_library( + name = "tick_calendar", + srcs = ["tick_calendar.py"], + srcs_version = "PY3", + deps = [ + ":base", + "//temporian/core:compilation", + "//temporian/core:operator_lib", + "//temporian/core:typing", + "//temporian/core/data:dtype", + "//temporian/core/data:node", + "//temporian/proto:core_py_proto", + "//temporian/utils:typecheck", + ], +) + py_library( name = "select_index_values", srcs = ["select_index_values.py"], diff --git a/temporian/core/operators/test/BUILD b/temporian/core/operators/test/BUILD index 7905eba58..1619970f8 100644 --- a/temporian/core/operators/test/BUILD +++ b/temporian/core/operators/test/BUILD @@ -53,3 +53,15 @@ py_test( "//temporian/core/operators:until_next", ], ) + +py_test( + name = "tick_calendar_test", + srcs = ["tick_calendar_test.py"], + srcs_version = "PY3", + deps = [ + # already_there/absl/testing:absltest + "//temporian/core/data:dtype", + "//temporian/core/data:node", + "//temporian/core/operators:tick_calendar", + ], +) diff --git a/temporian/core/operators/test/tick_calendar_test.py b/temporian/core/operators/test/tick_calendar_test.py new file mode 100644 index 000000000..b1e059779 --- /dev/null +++ b/temporian/core/operators/test/tick_calendar_test.py @@ -0,0 +1,154 @@ +# Copyright 2021 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from absl.testing import absltest + +from temporian.core.data.node import input_node +from temporian.core.operators.tick_calendar import tick_calendar, TickCalendar + + +class TickCalendarOperatorTest(absltest.TestCase): + def setUp(self): + self._in = input_node([], is_unix_timestamp=True) + + def test_free_seconds_month(self): + output = tick_calendar(self._in, second="*", minute=1, hour=1, mday=31) + op = output.creator + assert isinstance(op, TickCalendar) + self.assertEqual(op.second, "*") + self.assertEqual(op.minute, 1) + self.assertEqual(op.hour, 1) + self.assertEqual(op.mday, 31) + self.assertEqual(op.month, "*") + self.assertEqual(op.wday, "*") + + def test_free_minutes(self): + output = tick_calendar(self._in, minute="*") + op = output.creator + assert isinstance(op, TickCalendar) + self.assertEqual(op.second, 0) + self.assertEqual(op.minute, "*") + self.assertEqual(op.hour, "*") + self.assertEqual(op.mday, "*") + self.assertEqual(op.month, "*") + self.assertEqual(op.wday, "*") + + def test_month_day(self): + output = tick_calendar(self._in, mday=5) + op = output.creator + assert isinstance(op, TickCalendar) + self.assertEqual(op.second, 0) + self.assertEqual(op.minute, 0) + self.assertEqual(op.hour, 0) + self.assertEqual(op.mday, 5) + self.assertEqual(op.month, "*") + self.assertEqual(op.wday, "*") + + def test_month(self): + output = tick_calendar(self._in, month=8) + op = output.creator + assert isinstance(op, TickCalendar) + self.assertEqual(op.second, 0) + self.assertEqual(op.minute, 0) + self.assertEqual(op.hour, 0) + self.assertEqual(op.mday, 1) + self.assertEqual(op.month, 8) + self.assertEqual(op.wday, "*") + + def test_weekdays(self): + output = tick_calendar(self._in, wday=6) + op = output.creator + assert isinstance(op, TickCalendar) + self.assertEqual(op.second, 0) + self.assertEqual(op.minute, 0) + self.assertEqual(op.hour, 0) + 
self.assertEqual(op.mday, "*") + self.assertEqual(op.month, "*") + self.assertEqual(op.wday, 6) + + def test_weekdays_month(self): + output = tick_calendar(self._in, wday=6, month=3) + op = output.creator + assert isinstance(op, TickCalendar) + self.assertEqual(op.second, 0) + self.assertEqual(op.minute, 0) + self.assertEqual(op.hour, 0) + self.assertEqual(op.mday, "*") + self.assertEqual(op.month, 3) + self.assertEqual(op.wday, 6) + + def test_weekdays_all_hours(self): + output = tick_calendar(self._in, wday=6, hour="*") + op = output.creator + assert isinstance(op, TickCalendar) + self.assertEqual(op.second, 0) + self.assertEqual(op.minute, 0) + self.assertEqual(op.hour, "*") + self.assertEqual(op.mday, "*") + self.assertEqual(op.month, "*") + self.assertEqual(op.wday, 6) + + def test_invalid_ranges(self): + for kwargs in ( + {"second": -1}, + {"second": 60}, + {"minute": -1}, + {"minute": 60}, + {"hour": -1}, + {"hour": 24}, + {"mday": 0}, + {"mday": 32}, + {"mday": -1}, # may be supported in the future + {"month": -1}, + {"month": 13}, + {"wday": -1}, + {"wday": 7}, + ): + with self.assertRaisesRegex( + ValueError, "Value should be '\*' or integer in range" + ): + _ = tick_calendar(self._in, **kwargs) # type: ignore + + def test_invalid_types(self): + for kwargs in ( + {"second": "1"}, + {"minute": "00"}, + {"hour": "00:00"}, + {"month": "January"}, + {"wday": "Sat"}, + ): + with self.assertRaisesRegex(ValueError, "Non matching type"): + _ = tick_calendar(self._in, **kwargs) # type: ignore + + def test_undefined_args(self): + with self.assertRaisesRegex( + ValueError, + "Can't set argument to None because previous and following", + ): + _ = tick_calendar(self._in, second=1, hour=1) # undefined min + + with self.assertRaisesRegex( + ValueError, + "Can't set argument to None because previous and following", + ): + _ = tick_calendar(self._in, second=1, month=1) + + with self.assertRaisesRegex( + ValueError, + "Can't set argument to None because previous and 
following", + ): + _ = tick_calendar(self._in, hour=0, month=1) + + +if __name__ == "__main__": + absltest.main() diff --git a/temporian/core/operators/tick_calendar.py b/temporian/core/operators/tick_calendar.py new file mode 100644 index 000000000..07e64136b --- /dev/null +++ b/temporian/core/operators/tick_calendar.py @@ -0,0 +1,259 @@ +# Copyright 2021 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""TickCalendar operator class and public API function definitions.""" +from typing import Literal, Tuple, Optional, Union + +import numpy as np + +from temporian.core import operator_lib +from temporian.core.compilation import compile +from temporian.core.data.node import ( + EventSetNode, + create_node_new_features_new_sampling, +) +from temporian.core.operators.base import Operator +from temporian.core.typing import EventSetOrNode +from temporian.proto import core_pb2 as pb +from temporian.utils.typecheck import typecheck + +TypeWildCard = Literal["*"] + + +class TickCalendar(Operator): + def __init__( + self, + input: EventSetNode, + second: Union[int, TypeWildCard], + minute: Union[int, TypeWildCard], + hour: Union[int, TypeWildCard], + mday: Union[int, TypeWildCard], + month: Union[int, TypeWildCard], + wday: Union[int, TypeWildCard], + ): + super().__init__() + if not input.schema.is_unix_timestamp: + raise ValueError( + "Can only use tick_calendar on unix timestamp samplings" + ) + + # Attributes + self._second = self._check_arg(second, 
self.seconds_max_range()) + self._minute = self._check_arg(minute, self.minutes_max_range()) + self._hour = self._check_arg(hour, self.hours_max_range()) + self._mday = self._check_arg(mday, self.mday_max_range()) + self._month = self._check_arg(month, self.month_max_range()) + self._wday = self._check_arg(wday, self.wday_max_range()) + self.add_attribute("second", second) + self.add_attribute("minute", minute) + self.add_attribute("hour", hour) + self.add_attribute("mday", mday) + self.add_attribute("month", month) + self.add_attribute("wday", wday) + + self.add_input("input", input) + + self.add_output( + "output", + create_node_new_features_new_sampling( + features=[], + indexes=input.schema.indexes, + is_unix_timestamp=True, + creator=self, + ), + ) + + self.check() + + def _check_arg(self, arg_value, val_range): + if arg_value == "*" or ( + isinstance(arg_value, (int, np.integer)) + and arg_value >= val_range[0] + and arg_value <= val_range[1] + ): + return arg_value + raise ValueError( + f"Value should be '*' or integer in range {val_range}, got:" + f" {arg_value} (type {type(arg_value)})" + ) + + @property + def second(self) -> Union[int, TypeWildCard]: + # assert for typecheck + assert self._second == "*" or not isinstance(self._second, str) + return self._second + + @property + def minute(self) -> Union[int, TypeWildCard]: + # assert for typecheck + assert self._minute == "*" or not isinstance(self._minute, str) + return self._minute + + @property + def hour(self) -> Union[int, TypeWildCard]: + # assert for typecheck + assert self._hour == "*" or not isinstance(self._hour, str) + return self._hour + + @property + def mday(self) -> Union[int, TypeWildCard]: + # assert for typecheck + assert self._mday == "*" or not isinstance(self._mday, str) + return self._mday + + @property + def month(self) -> Union[int, TypeWildCard]: + # assert for typecheck + assert self._month == "*" or not isinstance(self._month, str) + return self._month + + @property + def 
wday(self) -> Union[int, TypeWildCard]: + # assert for typecheck + assert self._wday == "*" or not isinstance(self._wday, str) + return self._wday + + @classmethod + def seconds_max_range(cls) -> Tuple[int, int]: + return (0, 59) + + @classmethod + def minutes_max_range(cls) -> Tuple[int, int]: + return (0, 59) + + @classmethod + def hours_max_range(cls) -> Tuple[int, int]: + return (0, 23) + + @classmethod + def mday_max_range(cls) -> Tuple[int, int]: + return (1, 31) + + @classmethod + def month_max_range(cls) -> Tuple[int, int]: + return (1, 12) + + @classmethod + def wday_max_range(cls) -> Tuple[int, int]: + return (0, 6) + + @classmethod + def build_op_definition(cls) -> pb.OperatorDef: + return pb.OperatorDef( + key="TICK_CALENDAR", + attributes=[ + pb.OperatorDef.Attribute( + key="second", + type=pb.OperatorDef.Attribute.Type.ANY, + ), + pb.OperatorDef.Attribute( + key="minute", + type=pb.OperatorDef.Attribute.Type.ANY, + ), + pb.OperatorDef.Attribute( + key="hour", + type=pb.OperatorDef.Attribute.Type.ANY, + ), + pb.OperatorDef.Attribute( + key="mday", + type=pb.OperatorDef.Attribute.Type.ANY, + ), + pb.OperatorDef.Attribute( + key="month", + type=pb.OperatorDef.Attribute.Type.ANY, + ), + pb.OperatorDef.Attribute( + key="wday", + type=pb.OperatorDef.Attribute.Type.ANY, + ), + ], + inputs=[pb.OperatorDef.Input(key="input")], + outputs=[pb.OperatorDef.Output(key="output")], + ) + + +operator_lib.register_operator(TickCalendar) + + +@typecheck +@compile +def tick_calendar( + input: EventSetOrNode, + second: Optional[Union[int, TypeWildCard]] = None, + minute: Optional[Union[int, TypeWildCard]] = None, + hour: Optional[Union[int, TypeWildCard]] = None, + mday: Optional[Union[int, TypeWildCard]] = None, + month: Optional[Union[int, TypeWildCard]] = None, + wday: Optional[Union[int, TypeWildCard]] = None, +) -> EventSetOrNode: + # Don't allow empty args + if all(arg is None for arg in (second, minute, hour, mday, month, wday)): + raise ValueError("At least one 
argument must be provided (not None).") + + # All defined values must be consecutive (no gaps with None) + if wday is not None: + sorted_args = [second, minute, hour, wday] + else: + sorted_args = [second, minute, hour, mday, month] + for idx, arg in enumerate(sorted_args): + if ( + arg is None + and any(a is not None for a in sorted_args[:idx]) + and any(a is not None for a in sorted_args[idx + 1 :]) + ): + raise ValueError( + "Can't set argument to None because previous and" + " following arguments were specified. Set to '*' or an" + " integer value instead" + ) + + # release_ranges becomes True when next args should be set to '*' by default + # e.g: user sets only hour=1 -> second=0,minute=0, mday='*', month='*' + release_ranges = False + + # Always set second=0 by default + if second is None: + second = 0 + else: + release_ranges = True # fixed seconds, free minute, hour + + if minute is None: + minute = "*" if release_ranges else 0 + else: + release_ranges = True # fixed minutes, free hour, day, month + + if hour is None: + hour = "*" if release_ranges else 0 + else: + release_ranges = True + + if mday is None: + # If wday is specified, always leave mday free by default + free_mday = release_ranges or wday is not None + mday = "*" if free_mday else 1 + + # Always free range by default + month = "*" if month is None else month + wday = "*" if wday is None else wday + + return TickCalendar( + input=input, # type: ignore + second=second, + minute=minute, + hour=hour, + mday=mday, + month=month, + wday=wday, + ).outputs["output"] diff --git a/temporian/core/test/registered_operators_test.py b/temporian/core/test/registered_operators_test.py index 76a0120ff..692c145e4 100644 --- a/temporian/core/test/registered_operators_test.py +++ b/temporian/core/test/registered_operators_test.py @@ -95,6 +95,7 @@ def test_base(self): "SUBTRACTION", "SUBTRACTION_SCALAR", "TICK", + "TICK_CALENDAR", "TIMESTAMPS", "UNIQUE_TIMESTAMPS", "UNTIL_NEXT", diff --git 
a/temporian/implementation/numpy/operators/BUILD b/temporian/implementation/numpy/operators/BUILD index dbb573406..eb346dc32 100644 --- a/temporian/implementation/numpy/operators/BUILD +++ b/temporian/implementation/numpy/operators/BUILD @@ -33,6 +33,7 @@ py_library( ":select_index_values", ":since_last", ":tick", + ":tick_calendar", ":timestamps", ":unary", ":unique_timestamps", @@ -421,8 +422,24 @@ py_library( # already_there/numpy ":base", "//temporian/core/data:duration_utils", + "//temporian/implementation/numpy:implementation_lib", + "//temporian/implementation/numpy/data:event_set", + "//temporian/implementation/numpy_cc/operators:operators_cc", "//temporian/core/operators:where", + ], +) + +py_library( + name = "tick_calendar", + srcs = ["tick_calendar.py"], + srcs_version = "PY3", + deps = [ + # already_there/numpy + ":base", + "//temporian/core/data:duration_utils", + "//temporian/core/operators:tick_calendar", "//temporian/implementation/numpy:implementation_lib", "//temporian/implementation/numpy/data:event_set", + "//temporian/implementation/numpy_cc/operators:operators_cc", ], ) diff --git a/temporian/implementation/numpy/operators/__init__.py b/temporian/implementation/numpy/operators/__init__.py index df80f934b..62bf6475f 100644 --- a/temporian/implementation/numpy/operators/__init__.py +++ b/temporian/implementation/numpy/operators/__init__.py @@ -62,6 +62,7 @@ from temporian.implementation.numpy.operators import select_index_values from temporian.implementation.numpy.operators import since_last from temporian.implementation.numpy.operators import tick +from temporian.implementation.numpy.operators import tick_calendar from temporian.implementation.numpy.operators import timestamps from temporian.implementation.numpy.operators import unique_timestamps from temporian.implementation.numpy.operators import filter_moving_count diff --git a/temporian/implementation/numpy/operators/test/BUILD b/temporian/implementation/numpy/operators/test/BUILD index 
8481025f3..20455f783 100644 --- a/temporian/implementation/numpy/operators/test/BUILD +++ b/temporian/implementation/numpy/operators/test/BUILD @@ -747,6 +747,22 @@ py_test( ], ) +py_test( + name = "tick_calendar_test", + srcs = ["tick_calendar_test.py"], + srcs_version = "PY3", + deps = [ + # already_there/absl/testing:absltest + ":utils", + "//temporian/core/data:dtype", + "//temporian/core/data:node", + "//temporian/core/data:schema", + "//temporian/implementation/numpy/data:io", + "//temporian/core/operators:tick_calendar", + "//temporian/implementation/numpy/operators:tick_calendar", + ], +) + py_test( name = "until_next_test", srcs = ["until_next_test.py"], diff --git a/temporian/implementation/numpy/operators/test/calendar_hour_test.py b/temporian/implementation/numpy/operators/test/calendar_hour_test.py index d86dfcec4..03f5328bb 100644 --- a/temporian/implementation/numpy/operators/test/calendar_hour_test.py +++ b/temporian/implementation/numpy/operators/test/calendar_hour_test.py @@ -26,15 +26,15 @@ from temporian.implementation.numpy.data.io import event_set from temporian.implementation.numpy.operators.test.utils import ( assertEqualEventSet, + SetTimezone, ) class CalendarHourNumpyImplementationTest(absltest.TestCase): """Test numpy implementation of calendar_hour operator.""" - def test_basic(self) -> None: - "Basic test with flat node." 
- input_evset = from_pandas( + def setUp(self): + self.input_evset = from_pandas( pd.DataFrame( data=[ [pd.to_datetime("1970-01-01 00:00:00", utc=True)], @@ -47,23 +47,36 @@ def test_basic(self) -> None: ), ) - output_evset = event_set( - timestamps=input_evset.get_arbitrary_index_data().timestamps, + self.output_evset = event_set( + timestamps=self.input_evset.get_arbitrary_index_data().timestamps, features={ "calendar_hour": np.array([0, 1, 1, 12, 23]).astype(np.int32), }, is_unix_timestamp=True, ) - operator = CalendarHourOperator(input_evset.node()) - impl = CalendarHourNumpyImplementation(operator) - output = impl.call(sampling=input_evset)["output"] + self.operator = CalendarHourOperator(self.input_evset.node()) + self.impl = CalendarHourNumpyImplementation(self.operator) - assertEqualEventSet(self, output, output_evset) + def test_basic(self) -> None: + "Basic test with flat node." + output = self.impl.call(sampling=self.input_evset)["output"] + + assertEqualEventSet(self, output, self.output_evset) self.assertTrue( output.get_arbitrary_index_data().features[0].dtype == np.int32 ) + def test_timezone_defined(self) -> None: + "Define TZ env var and check that it works identically" + with SetTimezone(): + output = self.impl.call(sampling=self.input_evset)["output"] + + assertEqualEventSet(self, output, self.output_evset) + self.assertTrue( + output.get_arbitrary_index_data().features[0].dtype == np.int32 + ) + if __name__ == "__main__": absltest.main() diff --git a/temporian/implementation/numpy/operators/test/tick_calendar_test.py b/temporian/implementation/numpy/operators/test/tick_calendar_test.py new file mode 100644 index 000000000..6d8b4bee5 --- /dev/null +++ b/temporian/implementation/numpy/operators/test/tick_calendar_test.py @@ -0,0 +1,227 @@ +# Copyright 2021 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from datetime import datetime, timedelta +from absl.testing import absltest + +from temporian.core.operators.tick_calendar import TickCalendar +from temporian.implementation.numpy.data.io import event_set +from temporian.implementation.numpy.operators.tick_calendar import ( + TickCalendarNumpyImplementation, +) +from temporian.implementation.numpy.operators.test.utils import ( + assertEqualEventSet, + testOperatorAndImp, + SetTimezone, +) + + +class TickCalendarOperatorTest(absltest.TestCase): + def test_start_end_00_00(self): + evset = event_set( + timestamps=[ + "2020-01-01 00:00", + "2020-03-01 00:00", + ], + ) + node = evset.node() + + # Expected output + expected_output = event_set( + timestamps=[ + "2020-01-01 00:00", + "2020-02-01 00:00", + "2020-03-01 00:00", + ], + ) + + # Run op + op = TickCalendar( + input=node, + second=0, + minute=0, + hour=0, + mday=1, + month="*", + wday="*", + ) + instance = TickCalendarNumpyImplementation(op) + testOperatorAndImp(self, op, instance) + output = instance.call(input=evset)["output"] + + assertEqualEventSet(self, output, expected_output) + + # Check that it's exactly the same with env TZ!=UTC defined + with SetTimezone(): + instance = TickCalendarNumpyImplementation(op) + testOperatorAndImp(self, op, instance) + output = instance.call(input=evset)["output"] + + assertEqualEventSet(self, output, expected_output) + + def test_start_end_offset(self): + evset = event_set( + timestamps=[ + "2020-01-01 13:04", + "2020-03-06 19:35", + ], + ) + node = evset.node() + + # Expected output + expected_output = 
event_set( + timestamps=[ + "2020-02-01 00:00", + "2020-03-01 00:00", + ], + ) + + # Run op + op = TickCalendar( + input=node, + second=0, + minute=0, + hour=0, + mday=1, + month="*", + wday="*", + ) + instance = TickCalendarNumpyImplementation(op) + testOperatorAndImp(self, op, instance) + output = instance.call(input=evset)["output"] + + assertEqualEventSet(self, output, expected_output) + + def test_end_of_month_seconds(self): + # All seconds at mday=31, should only be valid for months 1, 3, 5 + + evset = event_set( + timestamps=[ + datetime(2020, 1, 1, 0, 0, 0), + datetime(2020, 6, 1, 0, 0, 0), + ], + ) + node = evset.node() + + # Expected output + def seconds_at_01_01(day, month): + return [datetime(2020, day, month, 1, 1, sec) for sec in range(60)] + + expected_output = event_set( + timestamps=seconds_at_01_01(1, 31) + + seconds_at_01_01(3, 31) + + seconds_at_01_01(5, 31), + ) + + # Run op + op = TickCalendar( + input=node, + second="*", + minute=1, + hour=1, + mday=31, + month="*", + wday="*", + ) + instance = TickCalendarNumpyImplementation(op) + testOperatorAndImp(self, op, instance) + output = instance.call(input=evset)["output"] + + assertEqualEventSet(self, output, expected_output) + + def test_end_of_year_minutes(self): + # All hours/minutes from 30/12/2019 to 2/1/2020 + + evset = event_set( + timestamps=[ + # 4 days: 2 on 2019 + 2 on 2020 + datetime(2019, 12, 30, 0, 0, 0), + datetime(2020, 1, 2, 23, 59, 59), # 2/1 at 23:59:59 + ], + ) + node = evset.node() + + # Expected timestamps: all hours/minutes in 4 days + timestamps = [] + for day, month, year in [ + (30, 12, 2019), + (31, 12, 2019), + (1, 1, 2020), + (2, 1, 2020), + ]: + for hour in range(24): + for minute in range(60): + timestamps += [datetime(year, month, day, hour, minute, 0)] + expected_output = event_set( + timestamps=timestamps, + ) + + # Run op + op = TickCalendar( + input=node, + second=0, + minute="*", + hour="*", + mday="*", + month="*", + wday="*", + ) + instance = 
TickCalendarNumpyImplementation(op) + testOperatorAndImp(self, op, instance) + output = instance.call(input=evset)["output"] + + assertEqualEventSet(self, output, expected_output) + + def test_weekdays(self): + # All exact hours from all saturdays in 2023 + + evset = event_set( + timestamps=[ + datetime(2023, 1, 1), + datetime(2023, 12, 31, 23, 0, 0), + ], + ) + node = evset.node() + + # Expected timestamps: every exact hour of every Saturday in 2023 + timestamps = [] + day = datetime(2023, 1, 7) # First saturday + one_week = timedelta(days=7) + while day.year < 2024: + for hour in range(24): + timestamps += [day + timedelta(hours=hour)] + day += one_week + expected_output = event_set( + timestamps=timestamps, + ) + + # Run op + op = TickCalendar( + input=node, + second=0, + minute=0, + hour="*", + wday=6, + mday="*", + month="*", + ) + instance = TickCalendarNumpyImplementation(op) + testOperatorAndImp(self, op, instance) + output = instance.call(input=evset)["output"] + + assertEqualEventSet(self, output, expected_output) + + +if __name__ == "__main__": + absltest.main() diff --git a/temporian/implementation/numpy/operators/test/utils.py b/temporian/implementation/numpy/operators/test/utils.py index 9d4d80220..8706c3696 100644 --- a/temporian/implementation/numpy/operators/test/utils.py +++ b/temporian/implementation/numpy/operators/test/utils.py @@ -1,80 +1,96 @@ -# Copyright 2021 Google LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
def _mismatch_message(real, expected) -> str:
    """Builds the REAL / EXPECTED report attached to a failed assertion."""
    return (
        "\n==========\nREAL:\n==========\n"
        f"{real}"
        "\n==========\nEXPECTED:\n==========\n"
        f"{expected}"
    )


def assertEqualEventSet(
    test: absltest.TestCase, real: EventSet, expected: EventSet
):
    """Asserts the equality between real and expected.

    Prints a nice message in case of error.

    Args:
        test: Test case used to report the failure.
        real: EventSet produced by the code under test.
        expected: EventSet the result should be equal to.
    """

    test.assertEqual(real, expected, _mismatch_message(real, expected))


def assertEqualDFRandomRowOrder(
    test: absltest.TestCase, real: "pd.DataFrame", expected: "pd.DataFrame"
):
    """Asserts that two DataFrames hold the same rows, in any order.

    Rows are compared through the string form of their `to_dict()`
    representation. The comparison is done on multisets, so a row that is
    duplicated in one frame must be duplicated the same number of times in
    the other one (a plain set comparison would silently ignore that).

    Args:
        test: Test case used to report the failure.
        real: DataFrame produced by the code under test.
        expected: DataFrame the result should be equal to.
    """
    # Imported locally so this helper does not depend on the module's imports.
    from collections import Counter

    rows_real = Counter(str(row.to_dict()) for _, row in real.iterrows())
    rows_expected = Counter(
        str(row.to_dict()) for _, row in expected.iterrows()
    )
    test.assertEqual(
        rows_real, rows_expected, _mismatch_message(real, expected)
    )


def testOperatorAndImp(
    test: absltest.TestCase, op: Operator, imp: OperatorImplementation
):
    """Tests an operator and its implementation.

    Currently test:
      - Serialization / unserialization of the operator.

    Args:
        test: Test case running the check (not used by the current checks).
        op: Operator to serialize and unserialize.
        imp: Implementation of the operator (not used yet, see TODO).
    """

    # TODO: Add tests related to the implementation.
    del imp

    serialized_op = serialization._serialize_operator(op)

    # The unserialization needs to resolve every node referenced by the
    # operator, so both its inputs and outputs are indexed by identifier.
    nodes = {}
    for node in op.inputs.values():
        nodes[serialization._identifier(node)] = node
    for node in op.outputs.values():
        nodes[serialization._identifier(node)] = node

    _ = serialization._unserialize_operator(serialized_op, nodes)
class SetTimezone:
    """Context manager that temporarily changes the process timezone.

    On entry, the `TZ` environment variable is set to the requested timezone
    and `time.tzset()` is called so the change takes effect. On exit, the
    previous state of `TZ` is restored: its former value if it had one, or no
    variable at all if it was not defined (an empty `TZ` is not the same as
    an undefined one, so the variable must be removed rather than blanked).

    Args:
        timezone: Name of the timezone to apply inside the `with` block.
    """

    def __init__(self, timezone: str = "America/Montevideo"):
        self._tz = timezone
        # Value of TZ before entering; None means TZ was not defined.
        self._restore_tz = None

    def __enter__(self):
        self._restore_tz = os.environ.get("TZ")
        os.environ["TZ"] = self._tz
        time.tzset()
        return self

    def __exit__(self, *args):
        if self._restore_tz is None:
            # TZ did not exist before: remove it instead of leaving TZ="".
            os.environ.pop("TZ", None)
        else:
            os.environ["TZ"] = self._restore_tz
        time.tzset()
class TickCalendarNumpyImplementation(OperatorImplementation):
    """NumPy implementation of the `TickCalendar` operator.

    The timestamps themselves are generated by `operators_cc.tick_calendar`;
    this class translates the operator's arguments into (min, max) ranges and
    calls it once per index value of the input.
    """

    def __init__(self, operator: TickCalendar) -> None:
        assert isinstance(operator, TickCalendar)
        super().__init__(operator)

    def _get_arg_range(
        self,
        arg_value: Union[int, Literal["*"]],
        val_range: Tuple[int, int],
    ):
        """Converts an operator argument into an inclusive (min, max) pair.

        Args:
            arg_value: Either `"*"` or a fixed integer value.
            val_range: Full (min, max) range used when `arg_value` is `"*"`.

        Returns:
            `val_range` unpacked when `arg_value` is `"*"`, otherwise the
            fixed value repeated as both bounds.
        """
        if arg_value != "*":
            return arg_value, arg_value

        lowest, highest = val_range
        return lowest, highest

    def __call__(self, input: EventSet) -> Dict[str, EventSet]:
        """Generates the calendar ticks for each index value of `input`.

        Args:
            input: EventSet whose first and last timestamps (per index value)
                delimit the range passed to the tick generator.

        Returns:
            A dictionary with the feature-less result under key `"output"`.
        """
        assert isinstance(self.operator, TickCalendar)
        op = self.operator
        schema = self.output_schema("output")
        result = EventSet(data={}, schema=schema)

        # (min, max) bounds for each calendar field.
        bounds = {
            "second": self._get_arg_range(op.second, op.seconds_max_range()),
            "minute": self._get_arg_range(op.minute, op.minutes_max_range()),
            "hour": self._get_arg_range(op.hour, op.hours_max_range()),
            "mday": self._get_arg_range(op.mday, op.mday_max_range()),
            "month": self._get_arg_range(op.month, op.month_max_range()),
            "wday": self._get_arg_range(op.wday, op.wday_max_range()),
        }

        # Flatten into the min_<field> / max_<field> keyword arguments that
        # the C++ function expects.
        calendar_kwargs = {}
        for field, (lowest, highest) in bounds.items():
            calendar_kwargs[f"min_{field}"] = lowest
            calendar_kwargs[f"max_{field}"] = highest

        for index_key, index_data in input.data.items():
            src_timestamps = index_data.timestamps
            if len(src_timestamps) > 0:
                dst_timestamps = operators_cc.tick_calendar(
                    start_timestamp=src_timestamps[0],
                    end_timestamp=src_timestamps[-1],
                    **calendar_kwargs,
                )
            else:
                # No events for this index value: there is no range to fill.
                dst_timestamps = np.array([], dtype=np.float64)

            result.set_index_value(
                index_key,
                IndexData(
                    features=[],
                    timestamps=dst_timestamps,
                    schema=schema,
                ),
            )

        return {"output": result}


implementation_lib.register_operator_implementation(
    TickCalendar, TickCalendarNumpyImplementation
)
a/temporian/implementation/numpy_cc/operators/pyinit.cc +++ b/temporian/implementation/numpy_cc/operators/pyinit.cc @@ -6,6 +6,7 @@ #include "temporian/implementation/numpy_cc/operators/join.h" #include "temporian/implementation/numpy_cc/operators/resample.h" #include "temporian/implementation/numpy_cc/operators/since_last.h" +#include "temporian/implementation/numpy_cc/operators/tick_calendar.h" #include "temporian/implementation/numpy_cc/operators/until_next.h" #include "temporian/implementation/numpy_cc/operators/window.h" @@ -19,6 +20,7 @@ PYBIND11_MODULE(operators_cc, m) { init_window(m); init_join(m); init_add_index(m); + init_tick_calendar(m); init_filter_moving_count(m); init_until_next(m); } diff --git a/temporian/implementation/numpy_cc/operators/tick_calendar.cc b/temporian/implementation/numpy_cc/operators/tick_calendar.cc new file mode 100644 index 000000000..ca7c9a5a9 --- /dev/null +++ b/temporian/implementation/numpy_cc/operators/tick_calendar.cc @@ -0,0 +1,134 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "temporian/implementation/numpy_cc/operators/common.h" + +namespace { +namespace py = pybind11; + +py::array_t tick_calendar( + const double start_timestamp, // min date + const double end_timestamp, // max date + const int min_second, const int max_second, // second range + const int min_minute, const int max_minute, // minute range + const int min_hour, const int max_hour, // hours range + const int min_mday, const int max_mday, // month days + const int min_month, const int max_month, // month range + const int min_wday, const int max_wday // weekdays +) { + // Ticks list + std::vector ticks; + + // Date range + const long start_t = (long)std::floor(start_timestamp); + const long end_t = (long)std::floor(end_timestamp); + + std::tm start_utc = *std::gmtime(&start_t); + + int year = start_utc.tm_year; // from 1900 + int month = std::max(start_utc.tm_mon + 1, min_month); // zero-based 
tm_mon + int mday = std::max(start_utc.tm_mday, min_mday); // 1-31 + int hour = std::max(start_utc.tm_hour, min_hour); + int minute = std::max(start_utc.tm_min, min_minute); + int second = std::max(start_utc.tm_sec, min_second); + + // Workaround to get timestamp from UTC datetimes (mktime depends on timezone) + std::tm start_local = *std::localtime(&start_t); + const int offset_tzone = std::mktime(&start_utc) - std::mktime(&start_local); + + bool in_range = true; + while (in_range) { + while (month <= max_month && in_range) { + while (mday <= max_mday && in_range) { + while (hour <= max_hour && in_range) { + while (minute <= max_minute && in_range) { + while (second <= max_second && in_range) { + std::tm tm_date = {}; + tm_date.tm_year = year; // Since 1900 + tm_date.tm_mon = month - 1; // zero-based + tm_date.tm_mday = mday; + tm_date.tm_hour = hour; + tm_date.tm_min = minute; + tm_date.tm_sec = second; + tm_date.tm_isdst = 0; + tm_date.tm_gmtoff = start_local.tm_gmtoff; + + // This assumes that the date is in local timezone + const std::time_t time_local = std::mktime(&tm_date); + + // Valid date + if (time_local != -1 && tm_date.tm_mday == mday) { + // Remove timezone offset from timestamp + const std::time_t time_utc = time_local - offset_tzone; + + // Finish condition + if (time_utc > end_t) { + in_range = false; + break; + } + + // Check weekday match (mktime sets it properly) + if (tm_date.tm_wday >= min_wday && + tm_date.tm_wday <= max_wday) { + ticks.push_back(time_utc); + } + } else { + // Invalid date (e.g: 31/4) + second = max_second; // avoid unnecessary loops + minute = max_minute; + hour = max_hour; + } + second++; + } + second = min_second; + minute++; + } + second = min_second; + minute = min_minute; + hour++; + } + second = min_second; + minute = min_minute; + hour = min_hour; + mday++; + } + second = min_second; + minute = min_minute; + hour = min_hour; + mday = min_mday; + month++; + } + second = min_second; + minute = min_minute; + hour = 
min_hour; + mday = min_mday; + month = min_month; + year++; + } + // TODO: optimize mday += 7 on specific wdays + + // Allocate output array + // TODO: can we avoid this data copy? + py::array_t result(ticks.size()); + std::copy(ticks.begin(), ticks.end(), result.mutable_data()); + return result; +} + +} // namespace + +void init_tick_calendar(py::module &m) { + m.def("tick_calendar", &tick_calendar, "", py::arg("start_timestamp"), + py::arg("end_timestamp"), py::arg("min_second"), py::arg("max_second"), + py::arg("min_minute"), py::arg("max_minute"), py::arg("min_hour"), + py::arg("max_hour"), py::arg("min_mday"), py::arg("max_mday"), + py::arg("min_month"), py::arg("max_month"), py::arg("min_wday"), + py::arg("max_wday")); +} diff --git a/temporian/implementation/numpy_cc/operators/tick_calendar.h b/temporian/implementation/numpy_cc/operators/tick_calendar.h new file mode 100644 index 000000000..9a08c0611 --- /dev/null +++ b/temporian/implementation/numpy_cc/operators/tick_calendar.h @@ -0,0 +1,4 @@ +#include +#include + +void init_tick_calendar(pybind11::module &m); diff --git a/temporian/utils/typecheck.py b/temporian/utils/typecheck.py index da5659738..0bb46b9da 100644 --- a/temporian/utils/typecheck.py +++ b/temporian/utils/typecheck.py @@ -6,7 +6,7 @@ import logging -from typing import List, Set, Dict, Optional, Union, Tuple, Any +from typing import List, Set, Dict, Optional, Union, Tuple, Any, Literal import inspect import typing @@ -119,6 +119,13 @@ def _check_annotation(trace: _Trace, is_compiled: bool, value, annotation): origin = typing.get_origin(annotation) assert origin is not None + # Literal values check (e.g: Literal['*']) + if origin is Literal: + # Check param value in the allowed literal values + if value not in typing.get_args(annotation): + trace.exception(_base_error(value, annotation)) + return + if origin is not Union: if not isinstance(value, origin): # The origin (e.g. "list" in "List[int]") is wrong. 
@@ -208,6 +215,7 @@ def _check_annotation_union( trace.exception( f'Non matching type for "{type(value)}" in the union {annotation_args}.' + f' The value is "{value}".' )