From 436431b2d5a073c867bf32195d9e3368a282cc1d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 6 Jun 2024 16:42:27 -0400 Subject: [PATCH 01/15] Spike on using numpy datetime64 instead of python builtin datetime --- setup.cfg | 1 + src/undate/dateformat/iso8601.py | 13 +++++- src/undate/undate.py | 79 ++++++++++++++++++++++++++++---- tests/test_undate.py | 39 ++++++++-------- 4 files changed, 102 insertions(+), 30 deletions(-) diff --git a/setup.cfg b/setup.cfg index dc228d6..ad057d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,6 +41,7 @@ python_requires = >=3.8 install_requires = python-dateutil lark + numpy [options.package_data] * = diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index f1c5cca..0f5cee7 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -61,8 +61,17 @@ def to_string(self, undate: Undate) -> str: # and not others; force year to always be 4 digits if date_portion == "year": date_parts.append("%04d" % undate.earliest.year) - else: - date_parts.append(undate.earliest.strftime(iso_format)) + elif date_portion == "month": + date_parts.append("%02d" % undate.earliest.month) + elif date_portion == "day": + date_parts.append("%02d" % undate.earliest.day) + + # else: + # # date_parts.append(undate.earliest.strftime(iso_format)) + # e = undate.earliest + # # isoformat defined above per field + # date_parts.append(f"{e.year:04d}") # -{e.month:02d}-{e.day:02d}") + # date_parts.append(undate.earliest.strftime(iso_format)) elif date_portion == "year": # if not known but this is year, add '-' for --MM-DD unknown year format date_parts.append("-") diff --git a/src/undate/undate.py b/src/undate/undate.py index ee94bb6..066e910 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -5,14 +5,64 @@ # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Optional, Dict, Union - +import numpy as np from dateutil.relativedelta import relativedelta from undate.dateformat.base import BaseDateFormat #: duration of a single day -ONE_DAY = datetime.timedelta(days=1) +# ONE_DAY = datetime.timedelta(days=1) +ONE_DAY = np.timedelta64(1, "D") + + +class LocalDate: + def __init__(self, year: str, month: str = None, day: str = None): + if isinstance(year, np.datetime64): + self._date = year + else: + datestr = year + if month is not None: + datestr = f"{year}-{month:02d}" + if day is not None: + datestr = f"{datestr}-{day:02d}" + self._date = np.datetime64(datestr) + + def __str__(self): + return str(self._date) + + @property + def year(self): + return int(str(self._date.astype("datetime64[Y]"))) + + @property + def month(self): + return int(str(self._date.astype("datetime64[M]")).split("-")[-1]) + + @property + def day(self): + return int(str(self._date.astype("datetime64[D]")).split("-")[-1]) + + def __eq__(self, other: object) -> bool: + return self._date == other._date + + def __gt__(self, other: object) -> bool: + # define gt ourselves so we can support > comparison with datetime.date, + # but rely on existing less than implementation. + # strictly greater than must rule out equals + return not (self._date < other._date or self._date == other._date) + + def __le__(self, other: Union["Undate", datetime.date]) -> bool: + return self._date == other._date or self._date < other._date + + def __add__(self, other): + if isinstance(other, LocalDate): + return LocalDate(self._date + other._date) + if isinstance(other, np.timedelta64): + return LocalDate(self._date + other) + + def __sub__(self, other) -> np.timedelta64: + return self._date - other._date class DatePrecision(IntEnum): @@ -32,6 +82,8 @@ class DatePrecision(IntEnum): def __str__(self): return f"{self.name}" + # numpy date units are years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’), + class Undate: """Simple object for representing uncertain, fuzzy or partially unknown dates""" @@ -143,8 +195,11 @@ def __init__( # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = datetime.date(min_year, min_month, min_day) - self.latest = datetime.date(max_year, max_month, max_day) + # self.earliest = datetime.date(min_year, min_month, min_day) + # self.latest = datetime.date(max_year, max_month, max_day) + + self.earliest = LocalDate(min_year, min_month, min_day) + self.latest = LocalDate(max_year, max_month, max_day) if formatter is None: # import all subclass definitions; initialize the default @@ -318,16 +373,21 @@ def duration(self) -> datetime.timedelta: if not self.known_year: # if year is unknown, calculate month duration in # a single year - latest = datetime.date( + latest = LocalDate( self.earliest.year, self.latest.month, self.latest.day ) + + # latest = datetime.date( + # self.earliest.year, self.latest.month, self.latest.day + # ) delta = latest - self.earliest + ONE_DAY # month duration can't ever be more than 31 days # (could we ever know if it's smaller?) # if granularity == month but not known month, duration = 31 - if delta.days > 31: - return datetime.timedelta(days=31) + if delta.astype(int) > 31: + # return datetime.timedelta(days=31) + return np.timedelta64(31, "D") return delta # otherwise, calculate based on earliest/latest range @@ -431,8 +491,9 @@ def duration(self) -> datetime.timedelta: # if we get a negative, we've wrapped from end of one year # to the beginning of the next; # recalculate assuming second date is in the subsequent year - if duration.days < 0: - end = self.latest.earliest + relativedelta(years=1) + if duration.astype("int") < 0: + # end = self.latest.earliest + relativedelta(years=1) + end = self.latest.earliest + np.timedelta64(365, "D") duration = end - self.earliest.earliest # add the additional day *after* checking for a negative diff --git a/tests/test_undate.py b/tests/test_undate.py index cf0d9ce..1f82aa0 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -291,34 +291,35 @@ def test_sorting(self): def test_duration(self): day_duration = Undate(2022, 11, 7).duration() - assert isinstance(day_duration, timedelta) - assert day_duration.days == 1 + # assert isinstance(day_duration, timedelta) + assert day_duration.astype("int") == 1 january_duration = Undate(2022, 1).duration() - assert january_duration.days == 31 + assert january_duration.astype("int") == 31 feb_duration = Undate(2022, 2).duration() - assert feb_duration.days == 28 + assert feb_duration.astype("int") == 28 # next leap year will be 2024 leapyear_feb_duration = Undate(2024, 2).duration() - assert leapyear_feb_duration.days == 29 + assert leapyear_feb_duration.astype("int") == 29 year_duration = Undate(2022).duration() - assert year_duration.days == 365 + assert year_duration.astype("int") == 365 leapyear_duration = Undate(2024).duration() - assert leapyear_duration.days == 366 + assert leapyear_duration.astype("int") == 366 def test_partiallyknown_duration(self): # day in unknown month/year - assert Undate(day=5).duration().days == 1 - assert Undate(year=1900, month=11, day="2X").duration().days == 1 + # assert Undate(day=5).duration().days == 1 + assert Undate(day=5).duration().astype("int") == 1 + assert Undate(year=1900, month=11, day="2X").duration().astype("int") == 1 # month in unknown year - assert Undate(month=6).duration().days == 30 + assert Undate(month=6).duration().astype("int") == 30 # partially known month - assert Undate(year=1900, month="1X").duration().days == 31 + assert Undate(year=1900, month="1X").duration().astype("int") == 31 # what about february? # could vary with leap years, but assume non-leapyear - assert Undate(month=2).duration().days == 28 + assert Undate(month=2).duration().astype("int") == 28 def test_known_year(self): assert Undate(2022).known_year is True @@ -398,27 +399,27 @@ def test_duration(self): week_duration = UndateInterval( Undate(2022, 11, 1), Undate(2022, 11, 7) ).duration() - assert isinstance(week_duration, timedelta) - assert week_duration.days == 7 + # assert isinstance(week_duration, timedelta) + assert week_duration.astype("int") == 7 twomonths = UndateInterval(Undate(2022, 11), Undate(2022, 12)).duration() # november - december = 30 days + 31 days - assert twomonths.days == 30 + 31 + assert twomonths.astype("int") == 30 + 31 twoyears = UndateInterval(Undate(2021), Undate(2022)).duration() - assert twoyears.days == 365 * 2 + assert twoyears.astype("int") == 365 * 2 # special case: month/day with no year (assumes same year) week_noyear_duration = UndateInterval( Undate(None, 11, 1), Undate(None, 11, 7) ).duration() - assert week_noyear_duration.days == 7 + assert week_noyear_duration.astype("int") == 7 # special case 2: month/day with no year, wrapping from december to january # (assumes sequential years) month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.days == 32 + assert month_noyear_duration.astype("int") == 32 # this seems wrong, but we currently count both start and dates # real case from Shakespeare and Company Project data; @@ -426,7 +427,7 @@ def test_duration(self): month_noyear_duration = UndateInterval( Undate(None, 6, 7), Undate(None, 6, 6) ).duration() - assert month_noyear_duration.days == 365 + assert month_noyear_duration.astype("int") == 365 # duration is not supported for open-ended intervals assert UndateInterval(Undate(2000), None).duration() == NotImplemented From 220dc7e361e2cb335084d8b7929a9879d1a4382b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 6 Jun 2024 16:56:49 -0400 Subject: [PATCH 02/15] Set min/max year for np.datetime64 day range --- src/undate/undate.py | 9 +++++++-- tests/test_undate.py | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index 066e910..6cbc97a 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -140,8 +140,13 @@ def __init__( min_year = int(str(year).replace(self.MISSING_DIGIT, "0")) max_year = int(str(year).replace(self.MISSING_DIGIT, "9")) else: - min_year = datetime.MINYEAR - max_year = datetime.MAXYEAR + # min_year = datetime.MINYEAR + # max_year = datetime.MAXYEAR + # numpy datetime is stored as 64-bit integer, so length + # depends on the span; assume days for now + + max_year = int(2.5e16) + min_year = int(-2.5e16) # if month is passed in as a string but completely unknown, # treat as none diff --git a/tests/test_undate.py b/tests/test_undate.py index 1f82aa0..7c4c577 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -419,7 +419,9 @@ def test_duration(self): month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.astype("int") == 32 + assert month_noyear_duration.astype("int") == 31 + # change from relativedelta to timedelta64 changes this for some reason + # assert month_noyear_duration.astype("int") == 32 # this seems wrong, but we currently count both start and dates # real case from Shakespeare and Company Project data; From 3306bde5e7eb355e6f25494944298b979f6dd72b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 6 Jun 2024 17:02:22 -0400 Subject: [PATCH 03/15] Simplify np.datetime64 subclass thanks to stack overflow --- src/undate/undate.py | 60 +++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index 6cbc97a..d6ec3d4 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -16,53 +16,51 @@ ONE_DAY = np.timedelta64(1, "D") -class LocalDate: - def __init__(self, year: str, month: str = None, day: str = None): +class LocalDate(np.ndarray): + # shim to make np.datetime64 act more like datetime.date + + # extend np.datetime64 datatype + # adapted from https://stackoverflow.com/a/27129510/9706217 + + def __new__(cls, year: str, month: str = None, day: str = None): if isinstance(year, np.datetime64): - self._date = year + data = year else: datestr = year if month is not None: datestr = f"{year}-{month:02d}" if day is not None: datestr = f"{datestr}-{day:02d}" - self._date = np.datetime64(datestr) + data = np.datetime64(datestr) - def __str__(self): - return str(self._date) + data = np.asarray(data, dtype="datetime64") + if data.dtype != "datetime64[D]": + raise Exception( + "Unable to parse dates adequately to datetime64[D]: %s" % data + ) + obj = data.view(cls) + return obj + + def Export(self): + return self + + def __array_finalize__(self, obj): + if obj is None: + return + + # custom properties to access year, month, day @property def year(self): - return int(str(self._date.astype("datetime64[Y]"))) + return int(str(self.astype("datetime64[Y]"))) @property def month(self): - return int(str(self._date.astype("datetime64[M]")).split("-")[-1]) + return int(str(self.astype("datetime64[M]")).split("-")[-1]) @property def day(self): - return int(str(self._date.astype("datetime64[D]")).split("-")[-1]) - - def __eq__(self, other: object) -> bool: - return self._date == other._date - - def __gt__(self, other: object) -> bool: - # define gt ourselves so we can support > comparison with datetime.date, - # but rely on existing less than implementation. - # strictly greater than must rule out equals - return not (self._date < other._date or self._date == other._date) - - def __le__(self, other: Union["Undate", datetime.date]) -> bool: - return self._date == other._date or self._date < other._date - - def __add__(self, other): - if isinstance(other, LocalDate): - return LocalDate(self._date + other._date) - if isinstance(other, np.timedelta64): - return LocalDate(self._date + other) - - def __sub__(self, other) -> np.timedelta64: - return self._date - other._date + return int(str(self.astype("datetime64[D]")).split("-")[-1]) class DatePrecision(IntEnum): @@ -82,7 +80,7 @@ class DatePrecision(IntEnum): def __str__(self): return f"{self.name}" - # numpy date units are years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’), + # numpy date units are years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’) class Undate: From 0396a647fc7382ec9dc78a2bd1dd12b9e71e086c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 16 Aug 2024 16:38:09 -0400 Subject: [PATCH 04/15] Move numpy date shim into separate file --- src/undate/date.py | 79 +++++++++++++++++++++++++++++++++++++ src/undate/undate.py | 93 ++++---------------------------------------- 2 files changed, 87 insertions(+), 85 deletions(-) create mode 100644 src/undate/date.py diff --git a/src/undate/date.py b/src/undate/date.py new file mode 100644 index 0000000..4422a71 --- /dev/null +++ b/src/undate/date.py @@ -0,0 +1,79 @@ +from enum import IntEnum + +import numpy as np + +#: timedelta for single day +ONE_DAY = np.timedelta64(1, "D") # ~ equivalent to datetime.timedelta(days=1) +#: timedelta for a single year (non-leap year) +ONE_YEAR = np.timedelta64(365, "D") # ~ relativedelta(years=1) +#: timedelta for a month, assuming maximum month length (31 days) +ONE_MONTH_MAX = np.timedelta64(31, "D") + + +class Date(np.ndarray): + """This class is a shim to make :class:`numpy.datetime64` act + more like the built-in python :class:`datetime.date`.""" + + # extend np.datetime64 datatype + # adapted from https://stackoverflow.com/a/27129510/9706217 + + def __new__(cls, year: str, month: str = None, day: str = None): + if isinstance(year, np.datetime64): + data = year + else: + datestr = year + if month is not None: + datestr = f"{year}-{month:02d}" + if day is not None: + datestr = f"{datestr}-{day:02d}" + data = np.datetime64(datestr) + + data = np.asarray(data, dtype="datetime64") + if data.dtype != "datetime64[D]": + raise Exception( + "Unable to parse dates adequately to datetime64[D]: %s" % data + ) + obj = data.view(cls) + return obj + + def Export(self): + return self + + def __array_finalize__(self, obj): + if obj is None: + return + + # custom properties to access year, month, day + + @property + def year(self): + return int(str(self.astype("datetime64[Y]"))) + + @property + def month(self): + return int(str(self.astype("datetime64[M]")).split("-")[-1]) + + @property + def day(self): + return int(str(self.astype("datetime64[D]")).split("-")[-1]) + + +class DatePrecision(IntEnum): + """date precision, to indicate date precision independent from how much + of the date is known.""" + + # numbers should be set to allow logical greater than / less than + # comparison, e.g. year precision > month + + #: day + DAY = 1 + #: month + MONTH = 2 + #: year + YEAR = 3 + + def __str__(self): + return f"{self.name}" + + # NOTE: consider harmonizing / using numpy units + # numpy date units are years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’) diff --git a/src/undate/undate.py b/src/undate/undate.py index d6ec3d4..d0b8cd0 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,90 +1,16 @@ import datetime -from calendar import monthrange -from enum import IntEnum import re +from calendar import monthrange # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Optional, Dict, Union -import numpy as np -from dateutil.relativedelta import relativedelta +from undate.date import Date, DatePrecision, ONE_DAY, ONE_YEAR, ONE_MONTH_MAX from undate.dateformat.base import BaseDateFormat -#: duration of a single day -# ONE_DAY = datetime.timedelta(days=1) -ONE_DAY = np.timedelta64(1, "D") - - -class LocalDate(np.ndarray): - # shim to make np.datetime64 act more like datetime.date - - # extend np.datetime64 datatype - # adapted from https://stackoverflow.com/a/27129510/9706217 - - def __new__(cls, year: str, month: str = None, day: str = None): - if isinstance(year, np.datetime64): - data = year - else: - datestr = year - if month is not None: - datestr = f"{year}-{month:02d}" - if day is not None: - datestr = f"{datestr}-{day:02d}" - data = np.datetime64(datestr) - - data = np.asarray(data, dtype="datetime64") - if data.dtype != "datetime64[D]": - raise Exception( - "Unable to parse dates adequately to datetime64[D]: %s" % data - ) - obj = data.view(cls) - return obj - - def Export(self): - return self - - def __array_finalize__(self, obj): - if obj is None: - return - - # custom properties to access year, month, day - - @property - def year(self): - return int(str(self.astype("datetime64[Y]"))) - - @property - def month(self): - return int(str(self.astype("datetime64[M]")).split("-")[-1]) - - @property - def day(self): - return int(str(self.astype("datetime64[D]")).split("-")[-1]) - - -class DatePrecision(IntEnum): - """date precision, to indicate date precision independent from how much - of the date is known.""" - - # numbers should be set to allow logical greater than / less than - # comparison, e.g. year precision > month - - #: day - DAY = 1 - #: month - MONTH = 2 - #: year - YEAR = 3 - - def __str__(self): - return f"{self.name}" - - # numpy date units are years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’) - - class Undate: - """Simple object for representing uncertain, fuzzy or partially unknown dates""" + """object for representing uncertain, fuzzy or partially unknown dates""" DEFAULT_FORMAT: str = "ISO8601" @@ -201,8 +127,8 @@ def __init__( # self.earliest = datetime.date(min_year, min_month, min_day) # self.latest = datetime.date(max_year, max_month, max_day) - self.earliest = LocalDate(min_year, min_month, min_day) - self.latest = LocalDate(max_year, max_month, max_day) + self.earliest = Date(min_year, min_month, min_day) + self.latest = Date(max_year, max_month, max_day) if formatter is None: # import all subclass definitions; initialize the default @@ -376,9 +302,7 @@ def duration(self) -> datetime.timedelta: if not self.known_year: # if year is unknown, calculate month duration in # a single year - latest = LocalDate( - self.earliest.year, self.latest.month, self.latest.day - ) + latest = Date(self.earliest.year, self.latest.month, self.latest.day) # latest = datetime.date( # self.earliest.year, self.latest.month, self.latest.day @@ -390,7 +314,7 @@ def duration(self) -> datetime.timedelta: # if granularity == month but not known month, duration = 31 if delta.astype(int) > 31: # return datetime.timedelta(days=31) - return np.timedelta64(31, "D") + return ONE_MONTH_MAX return delta # otherwise, calculate based on earliest/latest range @@ -495,8 +419,7 @@ def duration(self) -> datetime.timedelta: # to the beginning of the next; # recalculate assuming second date is in the subsequent year if duration.astype("int") < 0: - # end = self.latest.earliest + relativedelta(years=1) - end = self.latest.earliest + np.timedelta64(365, "D") + end = self.latest.earliest + ONE_YEAR duration = end - self.earliest.earliest # add the additional day *after* checking for a negative From 1a756406cd2b28ce4637d5cb6c718e73db2e523f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 16 Aug 2024 17:01:40 -0400 Subject: [PATCH 05/15] Add unit tests for date class --- src/undate/date.py | 29 +++++++++++++++++++++-------- tests/test_date.py | 43 +++++++++++++++++++++++++++++++++++++++++++ tests/test_undate.py | 7 +------ 3 files changed, 65 insertions(+), 14 deletions(-) create mode 100644 tests/test_date.py diff --git a/src/undate/date.py b/src/undate/date.py index 4422a71..c6a095f 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -17,11 +17,11 @@ class Date(np.ndarray): # extend np.datetime64 datatype # adapted from https://stackoverflow.com/a/27129510/9706217 - def __new__(cls, year: str, month: str = None, day: str = None): + def __new__(cls, year: int, month: int = None, day: int = None): if isinstance(year, np.datetime64): data = year else: - datestr = year + datestr = str(year) if month is not None: datestr = f"{year}-{month:02d}" if day is not None: @@ -29,9 +29,18 @@ def __new__(cls, year: str, month: str = None, day: str = None): data = np.datetime64(datestr) data = np.asarray(data, dtype="datetime64") - if data.dtype != "datetime64[D]": + + # expected format depends on granularity / how much of date is known + expected_granularity = "Y" + if day is not None and month is not None: + expected_granularity = "D" + elif month: + expected_granularity = "M" + expected_dtype = f"datetime64[{expected_granularity}]" + + if data.dtype != expected_dtype: raise Exception( - "Unable to parse dates adequately to datetime64[D]: %s" % data + f"Unable to parse dates adequately as {expected_dtype}: {data}" ) obj = data.view(cls) return obj @@ -51,11 +60,15 @@ def year(self): @property def month(self): - return int(str(self.astype("datetime64[M]")).split("-")[-1]) + # if date unit is year, don't return a month (only M/D) + if not self.dtype == "datetime64[Y]": + return int(str(self.astype("datetime64[M]")).split("-")[-1]) @property def day(self): - return int(str(self.astype("datetime64[D]")).split("-")[-1]) + # only return a day if date unit is in days + if self.dtype == "datetime64[D]": + return int(str(self.astype("datetime64[D]")).split("-")[-1]) class DatePrecision(IntEnum): @@ -75,5 +88,5 @@ class DatePrecision(IntEnum): def __str__(self): return f"{self.name}" - # NOTE: consider harmonizing / using numpy units - # numpy date units are years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’) + # NOTE: consider harmonizing / using numpy date units: + # years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’) diff --git a/tests/test_date.py b/tests/test_date.py new file mode 100644 index 0000000..cb56790 --- /dev/null +++ b/tests/test_date.py @@ -0,0 +1,43 @@ +import numpy as np + +from undate.date import Date, DatePrecision, ONE_DAY, ONE_YEAR, ONE_MONTH_MAX + + +class TestDatePrecision: + def test_str(self): + assert str(DatePrecision.YEAR) == "YEAR" + + +class TestDate: + def test_init_year(self): + d = Date(2001) + assert isinstance(d, Date) + assert d.dtype == "datetime64[Y]" + assert str(d) == "2001" + + def test_init_year_month(self): + d = Date(2010, 5) + assert isinstance(d, Date) + assert d.dtype == "datetime64[M]" + assert str(d) == "2010-05" + + def test_init_year_month(self): + d = Date(2021, 6, 15) + assert isinstance(d, Date) + assert d.dtype == "datetime64[D]" + assert str(d) == "2021-06-15" + + def test_properties_year(self): + assert Date(2001).year == 2001 + assert Date(2010, 5).year == 2010 + assert Date(2021, 6, 15).year == 2021 + + def test_properties_month(self): + assert Date(2001).month is None + assert Date(2010, 5).month == 5 + assert Date(2021, 6, 15).month == 6 + + def test_properties_day(self): + assert Date(2001).day is None + assert Date(2010, 5).day == None + assert Date(2021, 6, 15).day == 15 diff --git a/tests/test_undate.py b/tests/test_undate.py index 7c4c577..e2ab201 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -2,12 +2,7 @@ import pytest -from undate.undate import Undate, UndateInterval, DatePrecision - - -class TestDatePrecision: - def test_str(self): - assert str(DatePrecision.YEAR) == "YEAR" +from undate.undate import Undate, UndateInterval class TestUndate: From 329fa3d67a90266db481c700018998089b618f45 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 16 Aug 2024 17:21:37 -0400 Subject: [PATCH 06/15] Update type hints --- pyproject.toml | 3 +++ src/undate/date.py | 22 +++++++++++++--------- src/undate/undate.py | 45 ++++++++++++++++++++++---------------------- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 374b58c..320e75f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,3 +4,6 @@ requires = [ "wheel" ] build-backend = "setuptools.build_meta" + +[tool.mypy] +plugins = ["numpy.typing.mypy_plugin"] diff --git a/src/undate/date.py b/src/undate/date.py index c6a095f..349c919 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -1,5 +1,9 @@ from enum import IntEnum +# Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None +from typing import Optional, Dict, Union + + import numpy as np #: timedelta for single day @@ -17,26 +21,26 @@ class Date(np.ndarray): # extend np.datetime64 datatype # adapted from https://stackoverflow.com/a/27129510/9706217 - def __new__(cls, year: int, month: int = None, day: int = None): + def __new__(cls, year: int, month: Optional[int] = None, day: Optional[int] = None): if isinstance(year, np.datetime64): - data = year + _data = year else: datestr = str(year) if month is not None: datestr = f"{year}-{month:02d}" if day is not None: datestr = f"{datestr}-{day:02d}" - data = np.datetime64(datestr) + _data = np.datetime64(datestr) - data = np.asarray(data, dtype="datetime64") + data = np.asarray(_data, dtype="datetime64") - # expected format depends on granularity / how much of date is known - expected_granularity = "Y" + # expected dtype depends on date unit / how much of date is known + expected_unit = "Y" if day is not None and month is not None: - expected_granularity = "D" + expected_unit = "D" elif month: - expected_granularity = "M" - expected_dtype = f"datetime64[{expected_granularity}]" + expected_unit = "M" + expected_dtype = f"datetime64[{expected_unit}]" if data.dtype != expected_dtype: raise Exception( diff --git a/src/undate/undate.py b/src/undate/undate.py index d0b8cd0..a7bc164 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -3,7 +3,10 @@ from calendar import monthrange # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None -from typing import Optional, Dict, Union +from typing import Optional, Dict, Union, Any + +import numpy as np +from numpy.typing import ArrayLike, DTypeLike from undate.date import Date, DatePrecision, ONE_DAY, ONE_YEAR, ONE_MONTH_MAX from undate.dateformat.base import BaseDateFormat @@ -17,8 +20,8 @@ class Undate: #: symbol for unknown digits within a date value MISSING_DIGIT: str = "X" - earliest: datetime.date - latest: datetime.date + earliest: Date + latest: Date #: A string to label a specific undate, e.g. "German Unity Date 2022" for Oct. 3, 2022. #: Labels are not taken into account when comparing undate objects. label: Union[str, None] = None @@ -64,11 +67,9 @@ def __init__( min_year = int(str(year).replace(self.MISSING_DIGIT, "0")) max_year = int(str(year).replace(self.MISSING_DIGIT, "9")) else: - # min_year = datetime.MINYEAR - # max_year = datetime.MAXYEAR - # numpy datetime is stored as 64-bit integer, so length - # depends on the span; assume days for now - + # numpy datetime is stored as 64-bit integer, so min/max + # depends on the time unit; assume days for now + # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units max_year = int(2.5e16) min_year = int(-2.5e16) @@ -76,7 +77,7 @@ def __init__( # treat as none # TODO: we should preserve this information somehow; # difference between just a year and and an unknown month within a year - # maybe in terms of granularity / size ? + # maybe in terms of date precision ? if month == "XX": month = None @@ -124,9 +125,6 @@ def __init__( # for unknowns, assume smallest possible value for earliest and # largest valid for latest - # self.earliest = datetime.date(min_year, min_month, min_day) - # self.latest = datetime.date(max_year, max_month, max_day) - self.earliest = Date(min_year, min_month, min_day) self.latest = Date(max_year, max_month, max_day) @@ -245,7 +243,7 @@ def __gt__(self, other: object) -> bool: # strictly greater than must rule out equals return not (self < other or self == other) - def __le__(self, other: Union["Undate", datetime.date]) -> bool: + def __le__(self, other: object) -> bool: return self == other or self < other def __contains__(self, other: object) -> bool: @@ -256,15 +254,17 @@ def __contains__(self, other: object) -> bool: if self == other: return False - return ( - self.earliest <= other.earliest - and self.latest >= other.latest - # is precision sufficient for comparing partially known dates? - and self.precision > other.precision + return all( + [ + self.earliest <= other.earliest, + self.latest >= other.latest, + # is precision sufficient for comparing partially known dates? + self.precision > other.precision, + ] ) @staticmethod - def from_datetime_date(dt_date): + def from_datetime_date(dt_date: datetime.date): """Initialize an :class:`Undate` object from a :class:`datetime.date`""" return Undate(dt_date.year, dt_date.month, dt_date.day) @@ -284,7 +284,7 @@ def is_known(self, part: str) -> bool: def is_partially_known(self, part: str) -> bool: return isinstance(self.initial_values[part], str) - def duration(self) -> datetime.timedelta: + def duration(self): # -> np.timedelta64: """What is the duration of this date? Calculate based on earliest and latest date within range, taking into account the precision of the date even if not all @@ -313,7 +313,6 @@ def duration(self) -> datetime.timedelta: # if granularity == month but not known month, duration = 31 if delta.astype(int) > 31: - # return datetime.timedelta(days=31) return ONE_MONTH_MAX return delta @@ -394,11 +393,11 @@ def __eq__(self, other) -> bool: # consider interval equal if both dates are equal return self.earliest == other.earliest and self.latest == other.latest - def duration(self) -> datetime.timedelta: + def duration(self): # -> np.timedelta64: """Calculate the duration between two undates. :returns: A duration - :rtype: timedelta + :rtype: numpy.timedelta64 """ # what is the duration of this date range? From 616ae2f1be95afad5f9a591845008e3e5d75c1e2 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 27 Sep 2024 11:43:48 -0400 Subject: [PATCH 07/15] Switch from black to ruff and fix imports --- .pre-commit-config.yaml | 8 +++++--- README.md | 2 +- src/undate/date.py | 3 +-- src/undate/dateformat/base.py | 3 +-- src/undate/dateformat/edtf/parser.py | 1 - src/undate/dateformat/edtf/transformer.py | 3 ++- src/undate/dateformat/iso8601.py | 3 ++- src/undate/undate.py | 4 ++-- tests/test_date.py | 5 ++--- tests/test_dateformat/edtf/test_edtf_parser.py | 1 - tests/test_dateformat/edtf/test_edtf_transformer.py | 3 +-- tests/test_dateformat/test_base.py | 1 - tests/test_undate.py | 3 +-- 13 files changed, 18 insertions(+), 22 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1091d07..f2e1151 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,11 @@ files: \.py repos: - - repo: https://github.com/psf/black - rev: 22.10.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.4 hooks: - - id: black + - id: ruff + args: [ --select, I, --fix, --exit-non-zero-on-fix ] + - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 hooks: diff --git a/README.md b/README.md index 519fe2a..3c7b56d 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ It was initially created as part of a [DH-Tech](https://dh-tech.github.io/) hack [![Documentation Status](https://readthedocs.org/projects/undate-python/badge/?version=latest)](https://undate-python.readthedocs.io/en/latest/?badge=latest) [![unit tests](https://github.com/dh-tech/undate-python/actions/workflows/unit_tests.yml/badge.svg)](https://github.com/dh-tech/undate-python/actions/workflows/unit_tests.yml) [![codecov](https://codecov.io/gh/dh-tech/undate-python/branch/main/graph/badge.svg?token=GE7HZE8C9D)](https://codecov.io/gh/dh-tech/undate-python) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![All Contributors](https://img.shields.io/badge/all_contributors-5-orange.svg?style=flat-square)](CONTRIBUTORS.md) diff --git a/src/undate/date.py b/src/undate/date.py index 349c919..d2935e8 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -1,8 +1,7 @@ from enum import IntEnum # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None -from typing import Optional, Dict, Union - +from typing import Dict, Optional, Union import numpy as np diff --git a/src/undate/dateformat/base.py b/src/undate/dateformat/base.py index f4435f4..46a25ba 100644 --- a/src/undate/dateformat/base.py +++ b/src/undate/dateformat/base.py @@ -15,9 +15,8 @@ import importlib import logging import pkgutil -from typing import Dict, Type from functools import lru_cache # functools.cache not available until 3.9 - +from typing import Dict, Type logger = logging.getLogger(__name__) diff --git a/src/undate/dateformat/edtf/parser.py b/src/undate/dateformat/edtf/parser.py index 8826b2d..6ab5139 100644 --- a/src/undate/dateformat/edtf/parser.py +++ b/src/undate/dateformat/edtf/parser.py @@ -2,7 +2,6 @@ from lark import Lark - grammar_path = os.path.join(os.path.dirname(__file__), "edtf.lark") with open(grammar_path) as grammar: diff --git a/src/undate/dateformat/edtf/transformer.py b/src/undate/dateformat/edtf/transformer.py index cca3609..2afab28 100644 --- a/src/undate/dateformat/edtf/transformer.py +++ b/src/undate/dateformat/edtf/transformer.py @@ -1,4 +1,5 @@ -from lark import Transformer, Tree, Token +from lark import Token, Transformer, Tree + from undate.undate import Undate, UndateInterval diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index 0f5cee7..0b3a3b5 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -1,6 +1,7 @@ +from typing import Dict, List, Union + from undate.dateformat.base import BaseDateFormat from undate.undate import Undate, UndateInterval -from typing import Dict, List, Union class ISO8601DateFormat(BaseDateFormat): diff --git a/src/undate/undate.py b/src/undate/undate.py index a7bc164..ce4dc8b 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -3,12 +3,12 @@ from calendar import monthrange # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None -from typing import Optional, Dict, Union, Any +from typing import Any, Dict, Optional, Union import numpy as np from numpy.typing import ArrayLike, DTypeLike -from undate.date import Date, DatePrecision, ONE_DAY, ONE_YEAR, ONE_MONTH_MAX +from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision from undate.dateformat.base import BaseDateFormat diff --git a/tests/test_date.py b/tests/test_date.py index cb56790..caa8fc8 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,6 +1,5 @@ import numpy as np - -from undate.date import Date, DatePrecision, ONE_DAY, ONE_YEAR, ONE_MONTH_MAX +from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision class TestDatePrecision: @@ -21,7 +20,7 @@ def test_init_year_month(self): assert d.dtype == "datetime64[M]" assert str(d) == "2010-05" - def test_init_year_month(self): + def test_init_year_month_day(self): d = Date(2021, 6, 15) assert isinstance(d, Date) assert d.dtype == "datetime64[D]" diff --git a/tests/test_dateformat/edtf/test_edtf_parser.py b/tests/test_dateformat/edtf/test_edtf_parser.py index 5a2b8ea..3a2604b 100644 --- a/tests/test_dateformat/edtf/test_edtf_parser.py +++ b/tests/test_dateformat/edtf/test_edtf_parser.py @@ -1,5 +1,4 @@ import pytest - from undate.dateformat.edtf.parser import edtf_parser # for now, just test that valid dates can be parsed diff --git a/tests/test_dateformat/edtf/test_edtf_transformer.py b/tests/test_dateformat/edtf/test_edtf_transformer.py index 3271b8b..821e42e 100644 --- a/tests/test_dateformat/edtf/test_edtf_transformer.py +++ b/tests/test_dateformat/edtf/test_edtf_transformer.py @@ -1,8 +1,7 @@ import pytest - -from undate.undate import Undate, UndateInterval from undate.dateformat.edtf.parser import edtf_parser from undate.dateformat.edtf.transformer import EDTFTransformer +from undate.undate import Undate, UndateInterval # for now, just test that valid dates can be parsed diff --git a/tests/test_dateformat/test_base.py b/tests/test_dateformat/test_base.py index 3687a37..26a9b97 100644 --- a/tests/test_dateformat/test_base.py +++ b/tests/test_dateformat/test_base.py @@ -1,7 +1,6 @@ import logging import pytest - from undate.dateformat.base import BaseDateFormat diff --git a/tests/test_undate.py b/tests/test_undate.py index e2ab201..b0891a9 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -1,7 +1,6 @@ -from datetime import timedelta, date +from datetime import date, timedelta import pytest - from undate.undate import Undate, UndateInterval From 2a7de40aba01687c1a43b9b9c9473c578e438d21 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 27 Sep 2024 12:55:04 -0400 Subject: [PATCH 08/15] Clean up based on code review and linters --- CONTRIBUTORS.md | 3 +- docs/conf.py | 2 +- pyproject.toml | 78 ++++++++++++++------------------ src/undate/date.py | 27 +++++++---- src/undate/dateformat/iso8601.py | 6 --- src/undate/undate.py | 10 ++-- tests/test_date.py | 14 +++++- 7 files changed, 70 insertions(+), 70 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3cfe1f6..415e17b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -46,5 +46,6 @@ We use [All Contributors](https://allcontributors.org/) because we recognize tha ### Related blog posts -- [by Rebecca Sutton Koeser](#blog-rlskoeser) +(blog-rlskoeser)= +#### [by Rebecca Sutton Koeser](#blog-rlskoeser) - [Join me for a DHTech hackathon? It’s an un-date!](https://dh-tech.github.io/blog/2023-02-09-hackathon-summary/) 2023-02-09 on DHTech blog diff --git a/docs/conf.py b/docs/conf.py index 1294ce3..8961d82 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,4 +86,4 @@ } # turn on relative links; make sure both github and sphinx links work -m2r_parse_relative_links = True +myst_enable_extensions = ["linkify"] diff --git a/pyproject.toml b/pyproject.toml index dff1b7f..3f3325b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,25 +6,29 @@ build-backend = "hatchling.build" name = "undate" description = "library for working with uncertain, fuzzy, or partially unknown dates and date intervals" readme = "README.md" -license = {text = "Apache-2"} +license = { text = "Apache-2" } requires-python = ">= 3.9" dynamic = ["version"] -dependencies = [ - "python-dateutil", - "lark", - "numpy" -] +dependencies = ["python-dateutil", "lark", "numpy"] authors = [ - {name = "Rebecca Sutton Koeser"}, - {name = "Cole Crawford"}, - {name = "Julia Damerow"}, - {name = "Robert Casties"}, - {name = "Malte Vogl"}, -# {name = "DHTech", email="dhtech.community@gmail.com"} ? + { name = "Rebecca Sutton Koeser" }, + { name = "Cole Crawford" }, + { name = "Julia Damerow" }, + { name = "Robert Casties" }, + { name = "Malte Vogl" }, + # {name = "DHTech", email="dhtech.community@gmail.com"} ? ] # currently no maintainers separate from authors -keywords = ["dates", "dating", "uncertainty", "uncertain-dates", "unknown", "partially-known", "digital-humanities"] +keywords = [ + "dates", + "dating", + "uncertainty", + "uncertain-dates", + "unknown", + "partially-known", + "digital-humanities", +] classifiers = [ "Development Status :: 2 - Pre-Alpha", "Programming Language :: Python :: 3", @@ -37,36 +41,25 @@ classifiers = [ "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Utilities", - "Typing :: Typed" + "Typing :: Typed", ] [project.optional-dependencies] -docs = [ - "sphinx>=7.0.0", - "alabaster", - "myst-parser" -] +docs = ["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"] dev = [ - "black>=22.10.0", - "pre-commit>=2.20.0", - "twine", - "wheel", - "build", - "mypy", - "treon", - "undate", - "undate[docs]", - "undate[test]", -] -test = [ - "pytest>=7.2", - "pytest-ordering", - "pytest-cov", -] -all = [ - "undate[dev]", - "undate[test]", + "black>=22.10.0", + "pre-commit>=2.20.0", + "twine", + "wheel", + "build", + "mypy", + "treon", + "undate", + "undate[docs]", + "undate[test]", ] +test = ["pytest>=7.2", "pytest-ordering", "pytest-cov"] +all = ["undate[dev]", "undate[test]"] [project.urls] Homepage = "https://github.com/dh-tech/undate-python" @@ -80,19 +73,14 @@ Changelog = "https://github.com/dh-tech/undate/main/master/CHANGELOG.md" path = "src/undate/__init__.py" [tool.hatch.build.targets.sdist] -include = [ - "src/undate/**/*.py", - "src/undate/**/*.lark", - "/tests", -] +include = ["src/undate/**/*.py", "src/undate/**/*.lark", "/tests"] [tool.pytest.ini_options] pythonpath = "src/" markers = [ "last : run marked tests after all others", - "first : run marked tests before all others" + "first : run marked tests before all others", ] [tool.mypy] plugins = ["numpy.typing.mypy_plugin"] - diff --git a/src/undate/date.py b/src/undate/date.py index d2935e8..5f46678 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -1,7 +1,7 @@ from enum import IntEnum # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None -from typing import Dict, Optional, Union +from typing import Optional, Union import numpy as np @@ -20,7 +20,12 @@ class Date(np.ndarray): # extend np.datetime64 datatype # adapted from https://stackoverflow.com/a/27129510/9706217 - def __new__(cls, year: int, month: Optional[int] = None, day: Optional[int] = None): + def __new__( + cls, + year: Union[int, np.datetime64], + month: Optional[int] = None, + day: Optional[int] = None, + ): if isinstance(year, np.datetime64): _data = year else: @@ -42,7 +47,7 @@ def __new__(cls, year: int, month: Optional[int] = None, day: Optional[int] = No expected_dtype = f"datetime64[{expected_unit}]" if data.dtype != expected_dtype: - raise Exception( + raise ValueError( f"Unable to parse dates adequately as {expected_dtype}: {data}" ) obj = data.view(cls) @@ -64,7 +69,7 @@ def year(self): @property def month(self): # if date unit is year, don't return a month (only M/D) - if not self.dtype == "datetime64[Y]": + if self.dtype != "datetime64[Y]": return int(str(self.astype("datetime64[M]")).split("-")[-1]) @property @@ -78,15 +83,17 @@ class DatePrecision(IntEnum): """date precision, to indicate date precision independent from how much of the date is known.""" - # numbers should be set to allow logical greater than / less than - # comparison, e.g. year precision > month + # NOTE: values MUST be ordered based on the relative size or + # precison of the time unit. That is, the smaller the unit, the more precise + # it is: a day is more precise than a month, a month is more precise than a year, + # (DatePrecision.year < DatePrecision.month) - #: day - DAY = 1 + #: year + YEAR = 1 #: month MONTH = 2 - #: year - YEAR = 3 + #: day + DAY = 3 def __str__(self): return f"{self.name}" diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index 0b3a3b5..2366cc1 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -67,12 +67,6 @@ def to_string(self, undate: Undate) -> str: elif date_portion == "day": date_parts.append("%02d" % undate.earliest.day) - # else: - # # date_parts.append(undate.earliest.strftime(iso_format)) - # e = undate.earliest - # # isoformat defined above per field - # date_parts.append(f"{e.year:04d}") # -{e.month:02d}-{e.day:02d}") - # date_parts.append(undate.earliest.strftime(iso_format)) elif date_portion == "year": # if not known but this is year, add '-' for --MM-DD unknown year format date_parts.append("-") diff --git a/src/undate/undate.py b/src/undate/undate.py index ce4dc8b..47fe993 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -3,10 +3,7 @@ from calendar import monthrange # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None -from typing import Any, Dict, Optional, Union - -import numpy as np -from numpy.typing import ArrayLike, DTypeLike +from typing import Dict, Optional, Union from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision from undate.dateformat.base import BaseDateFormat @@ -259,7 +256,10 @@ def __contains__(self, other: object) -> bool: self.earliest <= other.earliest, self.latest >= other.latest, # is precision sufficient for comparing partially known dates? - self.precision > other.precision, + # checking based on less precise /less granular time unit, + # e.g. a day or month could be contained in a year + # but not the reverse + self.precision < other.precision, ] ) diff --git a/tests/test_date.py b/tests/test_date.py index caa8fc8..dcdb1a9 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,11 +1,15 @@ import numpy as np -from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision +from undate.date import Date, DatePrecision class TestDatePrecision: def test_str(self): assert str(DatePrecision.YEAR) == "YEAR" + def test_precision_comparison(self): + assert DatePrecision.DAY > DatePrecision.MONTH + assert DatePrecision.MONTH > DatePrecision.YEAR + class TestDate: def test_init_year(self): @@ -14,6 +18,12 @@ def test_init_year(self): assert d.dtype == "datetime64[Y]" assert str(d) == "2001" + def test_init_year_np_datetime64(self): + d = Date(np.datetime64("2024")) + assert isinstance(d, Date) + assert d.dtype == "datetime64[Y]" + assert str(d) == "2024" + def test_init_year_month(self): d = Date(2010, 5) assert isinstance(d, Date) @@ -38,5 +48,5 @@ def test_properties_month(self): def test_properties_day(self): assert Date(2001).day is None - assert Date(2010, 5).day == None + assert Date(2010, 5).day is None assert Date(2021, 6, 15).day == 15 From 2327d7b0fce259572471bc74cdf4d7ccbe905ee7 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 27 Sep 2024 15:23:28 -0400 Subject: [PATCH 09/15] Improve numpy duration & timedelta handling --- src/undate/date.py | 46 ++++++++++++++++++++++++++++++++++++++++---- src/undate/undate.py | 16 ++++++++++----- tests/test_date.py | 29 +++++++++++++++++++++++++++- tests/test_undate.py | 43 ++++++++++++++++++++--------------------- 4 files changed, 102 insertions(+), 32 deletions(-) diff --git a/src/undate/date.py b/src/undate/date.py index 5f46678..132384e 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -5,16 +5,40 @@ import numpy as np + +class Timedelta(np.ndarray): + """Convenience class to make :class:`numpy.timedelta64` act + more like the built-in python :class:`datetime.timedelta`.""" + + def __new__(cls, deltadays: Union[np.timedelta64, int]): + if isinstance(deltadays, int): + deltadays = np.timedelta64(deltadays, "D") + data = np.asarray(deltadays, dtype="timedelta64") + return data.view(cls) + + def Export(self): + return self + + def __array_finalize__(self, obj): + if obj is None: + return + + @property + def days(self) -> int: + """number of days, as an integer""" + return int(self.astype("datetime64[D]").astype("int")) + + #: timedelta for single day -ONE_DAY = np.timedelta64(1, "D") # ~ equivalent to datetime.timedelta(days=1) +ONE_DAY = Timedelta(1) # ~ equivalent to datetime.timedelta(days=1) #: timedelta for a single year (non-leap year) -ONE_YEAR = np.timedelta64(365, "D") # ~ relativedelta(years=1) +ONE_YEAR = Timedelta(365) # ~ relativedelta(years=1) #: timedelta for a month, assuming maximum month length (31 days) -ONE_MONTH_MAX = np.timedelta64(31, "D") +ONE_MONTH_MAX = Timedelta(31) class Date(np.ndarray): - """This class is a shim to make :class:`numpy.datetime64` act + """Convenience class to make :class:`numpy.datetime64` act more like the built-in python :class:`datetime.date`.""" # extend np.datetime64 datatype @@ -78,6 +102,20 @@ def day(self): if self.dtype == "datetime64[D]": return int(str(self.astype("datetime64[D]")).split("-")[-1]) + def __sub__(self, other): + # modify to conditionally return a timedelta object instead of a + # Date object with dtype timedelta64[D] (default behavior) + + result = super().__sub__(other) + # if the result has a timedelta type (i.e., date minus date = timedelta), + # cast to local Timedelta object; otherwise, leave as is + # (i.e., date minus timedelta = date) + if result.dtype == "timedelta64[D]": + result = Timedelta(result) + return result + + # NOTE: add should not be subclassed because we want to return a Date, not a delta + class DatePrecision(IntEnum): """date precision, to indicate date precision independent from how much diff --git a/src/undate/undate.py b/src/undate/undate.py index 47fe993..cb7d30a 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -28,6 +28,13 @@ class Undate: #: known non-leap year NON_LEAP_YEAR: int = 2022 + # numpy datetime is stored as 64-bit integer, so min/max + # depends on the time unit; assume days for now + # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units + # It just so happens that int(2.5e16) is a leap year, which is a weird default, + # so let's increase our lower bound by one year. + MIN_ALLOWABLE_YEAR = int(2.5e16) + 1 + MAX_ALLOWABLE_YEAR = int(-2.5e16) def __init__( self, @@ -64,11 +71,10 @@ def __init__( min_year = int(str(year).replace(self.MISSING_DIGIT, "0")) max_year = int(str(year).replace(self.MISSING_DIGIT, "9")) else: - # numpy datetime is stored as 64-bit integer, so min/max - # depends on the time unit; assume days for now - # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units - max_year = int(2.5e16) - min_year = int(-2.5e16) + # use the configured min/max allowable years if we + # don't have any other bounds + max_year = self.MIN_ALLOWABLE_YEAR + min_year = self.MAX_ALLOWABLE_YEAR # if month is passed in as a string but completely unknown, # treat as none diff --git a/tests/test_date.py b/tests/test_date.py index dcdb1a9..5ff017d 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,5 +1,5 @@ import numpy as np -from undate.date import Date, DatePrecision +from undate.date import ONE_YEAR, Date, DatePrecision, Timedelta class TestDatePrecision: @@ -50,3 +50,30 @@ def test_properties_day(self): assert Date(2001).day is None assert Date(2010, 5).day is None assert Date(2021, 6, 15).day == 15 + + def test_substract(self): + # date - date = timedelta + date_difference = Date(2024, 1, 2) - Date(2024, 1, 1) + assert isinstance(date_difference, Timedelta) + assert date_difference.days == 1 + + # date - timedelta = date + year_prior = Date(2024, 1, 2) - ONE_YEAR + assert isinstance(year_prior, Date) + + +class TestTimeDelta: + def test_init_from_int(self): + td = Timedelta(31) + assert isinstance(td, Timedelta) + assert td.dtype == "timedelta64[D]" + assert td.astype("int") == 31 + + def test_init_from_np_timedelta64(self): + td = Timedelta(np.timedelta64(12, "D")) + assert isinstance(td, Timedelta) + assert td.dtype == "timedelta64[D]" + assert td.astype("int") == 12 + + def test_days(self): + assert Timedelta(10).days == 10 diff --git a/tests/test_undate.py b/tests/test_undate.py index b0891a9..9e81f97 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -1,6 +1,8 @@ from datetime import date, timedelta +import numpy as np import pytest +from undate.date import Timedelta from undate.undate import Undate, UndateInterval @@ -285,35 +287,35 @@ def test_sorting(self): def test_duration(self): day_duration = Undate(2022, 11, 7).duration() - # assert isinstance(day_duration, timedelta) - assert day_duration.astype("int") == 1 + assert isinstance(day_duration, Timedelta) + assert day_duration.days == 1 january_duration = Undate(2022, 1).duration() - assert january_duration.astype("int") == 31 + assert january_duration.days == 31 feb_duration = Undate(2022, 2).duration() - assert feb_duration.astype("int") == 28 + assert feb_duration.days == 28 # next leap year will be 2024 leapyear_feb_duration = Undate(2024, 2).duration() - assert leapyear_feb_duration.astype("int") == 29 + assert leapyear_feb_duration.days == 29 year_duration = Undate(2022).duration() - assert year_duration.astype("int") == 365 + assert year_duration.days == 365 leapyear_duration = Undate(2024).duration() - assert leapyear_duration.astype("int") == 366 + assert leapyear_duration.days == 366 def test_partiallyknown_duration(self): # day in unknown month/year # assert Undate(day=5).duration().days == 1 - assert Undate(day=5).duration().astype("int") == 1 - assert Undate(year=1900, month=11, day="2X").duration().astype("int") == 1 + assert Undate(day=5).duration().days == 1 + assert Undate(year=1900, month=11, day="2X").duration().days == 1 # month in unknown year - assert Undate(month=6).duration().astype("int") == 30 + assert Undate(month=6).duration().days == 30 # partially known month - assert Undate(year=1900, month="1X").duration().astype("int") == 31 + assert Undate(year=1900, month="1X").duration().days == 31 # what about february? # could vary with leap years, but assume non-leapyear - assert Undate(month=2).duration().astype("int") == 28 + assert Undate(month=2).duration().days == 28 def test_known_year(self): assert Undate(2022).known_year is True @@ -393,37 +395,34 @@ def test_duration(self): week_duration = UndateInterval( Undate(2022, 11, 1), Undate(2022, 11, 7) ).duration() - # assert isinstance(week_duration, timedelta) - assert week_duration.astype("int") == 7 + assert isinstance(week_duration, Timedelta) + assert week_duration.days == 7 twomonths = UndateInterval(Undate(2022, 11), Undate(2022, 12)).duration() # november - december = 30 days + 31 days - assert twomonths.astype("int") == 30 + 31 + assert twomonths.days == 30 + 31 twoyears = UndateInterval(Undate(2021), Undate(2022)).duration() - assert twoyears.astype("int") == 365 * 2 + assert twoyears.days == 365 * 2 # special case: month/day with no year (assumes same year) week_noyear_duration = UndateInterval( Undate(None, 11, 1), Undate(None, 11, 7) ).duration() - assert week_noyear_duration.astype("int") == 7 + assert week_noyear_duration.days == 7 # special case 2: month/day with no year, wrapping from december to january # (assumes sequential years) month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.astype("int") == 31 - # change from relativedelta to timedelta64 changes this for some reason - # assert month_noyear_duration.astype("int") == 32 - # this seems wrong, but we currently count both start and dates + assert month_noyear_duration.days == 31 # real case from Shakespeare and Company Project data; # second date is a year minus one day in the future month_noyear_duration = UndateInterval( Undate(None, 6, 7), Undate(None, 6, 6) ).duration() - assert month_noyear_duration.astype("int") == 365 + assert month_noyear_duration.days == 365 # duration is not supported for open-ended intervals assert UndateInterval(Undate(2000), None).duration() == NotImplemented From 039ff0e59fb0c13ebd304ae919304e8958c26bfa Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 27 Sep 2024 16:01:45 -0400 Subject: [PATCH 10/15] Improve duration handling for unknown years --- .../shxco_partial_date_durations.ipynb | 628 ++++++++---------- src/undate/undate.py | 21 +- tests/test_undate.py | 21 +- 3 files changed, 316 insertions(+), 354 deletions(-) diff --git a/examples/notebooks/shxco_partial_date_durations.ipynb b/examples/notebooks/shxco_partial_date_durations.ipynb index 8d00a66..b89661f 100644 --- a/examples/notebooks/shxco_partial_date_durations.ipynb +++ b/examples/notebooks/shxco_partial_date_durations.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -29,8 +29,8 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Users/rkoeser/workarea/env/undate/bin/python -m pip install --upgrade pip\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Users/rkoeser/workarea/env/undate-py3.10/bin/python3.10 -m pip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -311,17 +311,18 @@ "\n", "Define a method to initialize an `UndateInterval` from start and end date strings in ISO format as used in S&co datasets\n", "\n", - "**Note:** There's an off-by-one discrepancy between how we currently calculate duration in Undate and in the Shakespeare and Company Project code; becauS&co code counts the first day in the range but not the last (this could also be thought of as counting half of the start and end dates). For simplicity of comparison here, we subtract one day from the result returned by `UndateInterval.duration`." + "**Note:** There's an off-by-one discrepancy between how we currently calculate duration in Undate and in the Shakespeare and Company Project code. This is because S&co code counts the first day in the range but not the last (this could also be thought of as counting half of the start and end dates). For simplicity of comparison here, we subtract one day from the result returned by `UndateInterval.duration`." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "y_MqgrQW64uI" }, "outputs": [], "source": [ + "from undate.date import ONE_DAY\n", "from undate.undate import UndateInterval\n", "from undate.dateformat.iso8601 import ISO8601DateFormat\n", "\n", @@ -333,9 +334,8 @@ " interval = UndateInterval(earliest=unstart, latest=unend)\n", "\n", " # subtract one here for simplicity of comparison,\n", - " # to reconcile difference between how duration logic\n", - "\n", - " return interval.duration().days - 1" + " # to reconcile differences between duration logic\n", + " return interval.duration() - ONE_DAY" ] }, { @@ -353,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -452,7 +452,7 @@ "260 4 months 122.0 " ] }, - "execution_count": 18, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -497,15 +497,15 @@ "91.0 397\n", "365.0 337\n", " ... \n", - "69.0 1\n", - "36.0 1\n", - "73.0 1\n", - "574.0 1\n", - "171.0 1\n", + "200.0 1\n", + "277.0 1\n", + "169.0 1\n", + "45.0 1\n", + "38.0 1\n", "Name: count, Length: 133, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -517,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -540,7 +540,7 @@ "Name: subscription_duration_days, dtype: float64" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -558,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -589,44 +589,25 @@ " \n", " \n", " \n", - " event_type\n", + " member_names\n", " start_date\n", " end_date\n", - " member_uris\n", - " member_names\n", - " member_sort_names\n", - " subscription_price_paid\n", - " subscription_deposit\n", " subscription_duration\n", " subscription_duration_days\n", - " ...\n", - " item_uri\n", - " item_title\n", - " item_volume\n", - " item_authors\n", - " item_year\n", - " item_notes\n", - " source_type\n", - " source_citation\n", - " source_manifest\n", - " source_image\n", " \n", " \n", " \n", " \n", "\n", - "

0 rows × 28 columns

\n", "" ], "text/plain": [ "Empty DataFrame\n", - "Columns: [event_type, start_date, end_date, member_uris, member_names, member_sort_names, subscription_price_paid, subscription_deposit, subscription_duration, subscription_duration_days, subscription_volumes, subscription_category, subscription_purchase_date, reimbursement_refund, borrow_status, borrow_duration_days, purchase_price, currency, item_uri, item_title, item_volume, item_authors, item_year, item_notes, source_type, source_citation, source_manifest, source_image]\n", - "Index: []\n", - "\n", - "[0 rows x 28 columns]" + "Columns: [member_names, start_date, end_date, subscription_duration, subscription_duration_days]\n", + "Index: []" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -638,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -707,7 +688,7 @@ "13686 NaN 31.0 " ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -726,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": { "id": "jwvN9-CgLQRx" }, @@ -746,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -793,7 +774,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 70\n", @@ -802,7 +783,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 233\n", @@ -811,7 +792,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " \n", " \n", " 234\n", @@ -820,7 +801,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " \n", " \n", " 260\n", @@ -829,7 +810,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " \n", " \n", "\n", @@ -843,15 +824,15 @@ "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", "260 Victor Llona 1923-06 1923-10 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 " + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days " ] }, - "execution_count": 21, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -864,7 +845,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -911,7 +892,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 70\n", @@ -920,7 +901,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 233\n", @@ -929,7 +910,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " \n", " \n", " 234\n", @@ -938,7 +919,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " \n", " \n", " 260\n", @@ -947,7 +928,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " \n", " \n", "\n", @@ -961,15 +942,15 @@ "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", "260 Victor Llona 1923-06 1923-10 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 " + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days " ] }, - "execution_count": 23, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -981,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1029,7 +1010,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1039,7 +1020,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1049,7 +1030,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1059,7 +1040,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1069,7 +1050,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1089,7 +1070,7 @@ " 1941-12-24\n", " 1 month\n", " 30.0\n", - " 30\n", + " 30 days\n", " 0.0\n", " \n", " \n", @@ -1099,7 +1080,7 @@ " 1941-12-24\n", " 1 month\n", " 30.0\n", - " 30\n", + " 30 days\n", " 0.0\n", " \n", " \n", @@ -1109,7 +1090,7 @@ " 1942-01-04\n", " 1 month\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -1119,7 +1100,7 @@ " 1942-03-08\n", " 3 months\n", " 90.0\n", - " 90\n", + " 90 days\n", " 0.0\n", " \n", " \n", @@ -1129,7 +1110,7 @@ " 1942-01-09\n", " 1 month\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -1164,36 +1145,36 @@ "35118 1942-03-08 3 months 90.0 \n", "35119 1942-01-09 1 month 31.0 \n", "\n", - " undate_duration duration_diff \n", - "28 730 365.0 \n", - "70 730 365.0 \n", - "233 61 30.0 \n", - "234 180 27.0 \n", - "260 152 30.0 \n", - "... ... ... \n", - "35114 30 0.0 \n", - "35115 30 0.0 \n", - "35116 31 0.0 \n", - "35118 90 0.0 \n", - "35119 31 0.0 \n", + " undate_duration duration_diff \n", + "28 730 days 365.0 \n", + "70 730 days 365.0 \n", + "233 61 days 30.0 \n", + "234 180 days 27.0 \n", + "260 152 days 30.0 \n", + "... ... ... \n", + "35114 30 days 0.0 \n", + "35115 30 days 0.0 \n", + "35116 31 days 0.0 \n", + "35118 90 days 0.0 \n", + "35119 31 days 0.0 \n", "\n", "[9144 rows x 7 columns]" ] }, - "execution_count": 24, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# what's the difference between the two?\n", - "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration - row.subscription_duration_days, axis=1)\n", + "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.subscription_duration_days, axis=1)\n", "subs_duration" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1206,20 +1187,20 @@ "data": { "text/plain": [ "duration_diff\n", - " 0.0 9065\n", - " 30.0 30\n", - " 29.0 21\n", - " 1.0 10\n", - "-1.0 9\n", - " 28.0 4\n", - " 365.0 2\n", - " 27.0 1\n", - " 2.0 1\n", - "-3.0 1\n", + "0.0 9065\n", + "30.0 30\n", + "29.0 21\n", + "1.0 10\n", + "-1.0 9\n", + "28.0 4\n", + "365.0 2\n", + "27.0 1\n", + "2.0 1\n", + "-3.0 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1239,7 +1220,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1287,7 +1268,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1297,7 +1278,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1307,7 +1288,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1317,7 +1298,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1327,7 +1308,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1337,7 +1318,7 @@ " 1923-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1347,7 +1328,7 @@ " 1924-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1357,7 +1338,7 @@ " 1924-04\n", " 2 months\n", " 60.0\n", - " 89\n", + " 89 days\n", " 29.0\n", " \n", " \n", @@ -1367,7 +1348,7 @@ " 1926-10\n", " 7 months\n", " 214.0\n", - " 244\n", + " 244 days\n", " 30.0\n", " \n", " \n", @@ -1377,7 +1358,7 @@ " 1926-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1397,32 +1378,32 @@ "293 Madeleine Lorsignol 1926-03 1926-10 \n", "313 M. Mathieu 1926-11 1926-12 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \\\n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 \n", - "261 1 month 31.0 60 \n", - "271 1 month 29.0 59 \n", - "272 2 months 60.0 89 \n", - "293 7 months 214.0 244 \n", - "313 1 month 30.0 60 \n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days \n", + "261 1 month 31.0 60 days \n", + "271 1 month 29.0 59 days \n", + "272 2 months 60.0 89 days \n", + "293 7 months 214.0 244 days \n", + "313 1 month 30.0 60 days \n", "\n", - " duration_diff \n", - "28 365.0 \n", - "70 365.0 \n", - "233 30.0 \n", - "234 27.0 \n", - "260 30.0 \n", - "261 29.0 \n", - "271 30.0 \n", - "272 29.0 \n", - "293 30.0 \n", - "313 30.0 " + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "233 30.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "261 29.0 \n", + "271 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "313 30.0 " ] }, - "execution_count": 41, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1435,7 +1416,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1455,14 +1436,14 @@ "4 months 5\n", "5 months 3\n", "1 year 2\n", - "7 months 2\n", "8 months 2\n", + "7 months 2\n", "11 months 1\n", "10 months 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1474,7 +1455,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1522,7 +1503,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1532,7 +1513,7 @@ " 1923-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1542,7 +1523,7 @@ " 1924-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1552,7 +1533,7 @@ " 1926-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1562,7 +1543,7 @@ " 1928-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1572,7 +1553,7 @@ " 1928-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1582,7 +1563,7 @@ " 1929-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1592,7 +1573,7 @@ " 1929-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1602,7 +1583,7 @@ " 1930-06\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1612,7 +1593,7 @@ " 1930-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1622,7 +1603,7 @@ " 1931-06\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1632,7 +1613,7 @@ " 1931-07\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1642,7 +1623,7 @@ " 1931-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1652,7 +1633,7 @@ " 1931-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1662,7 +1643,7 @@ " 1931-10\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1687,25 +1668,25 @@ "468 Elaine Cammett 1931-08 1931-09 1 month \n", "472 Frederick McWilliam 1931-09 1931-10 1 month \n", "\n", - " subscription_duration_days undate_duration duration_diff \n", - "233 31.0 61 30.0 \n", - "261 31.0 60 29.0 \n", - "271 29.0 59 30.0 \n", - "313 30.0 60 30.0 \n", - "354 29.0 59 30.0 \n", - "356 29.0 59 30.0 \n", - "393 31.0 60 29.0 \n", - "394 31.0 60 29.0 \n", - "430 31.0 60 29.0 \n", - "444 30.0 60 30.0 \n", - "462 31.0 60 29.0 \n", - "464 30.0 60 30.0 \n", - "466 31.0 61 30.0 \n", - "468 31.0 60 29.0 \n", - "472 30.0 60 30.0 " + " subscription_duration_days undate_duration duration_diff \n", + "233 31.0 61 days 30.0 \n", + "261 31.0 60 days 29.0 \n", + "271 29.0 59 days 30.0 \n", + "313 30.0 60 days 30.0 \n", + "354 29.0 59 days 30.0 \n", + "356 29.0 59 days 30.0 \n", + "393 31.0 60 days 29.0 \n", + "394 31.0 60 days 29.0 \n", + "430 31.0 60 days 29.0 \n", + "444 30.0 60 days 30.0 \n", + "462 31.0 60 days 29.0 \n", + "464 30.0 60 days 30.0 \n", + "466 31.0 61 days 30.0 \n", + "468 31.0 60 days 29.0 \n", + "472 30.0 60 days 30.0 " ] }, - "execution_count": 43, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1728,7 +1709,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1776,7 +1757,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1786,7 +1767,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1796,7 +1777,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1806,7 +1787,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1816,7 +1797,7 @@ " 1924-04\n", " 2 months\n", " 60.0\n", - " 89\n", + " 89 days\n", " 29.0\n", " \n", " \n", @@ -1826,7 +1807,7 @@ " 1926-10\n", " 7 months\n", " 214.0\n", - " 244\n", + " 244 days\n", " 30.0\n", " \n", " \n", @@ -1836,7 +1817,7 @@ " 1928-02\n", " 11 months\n", " 337.0\n", - " 365\n", + " 365 days\n", " 28.0\n", " \n", " \n", @@ -1846,7 +1827,7 @@ " 1927-10\n", " 3 months\n", " 92.0\n", - " 122\n", + " 122 days\n", " 30.0\n", " \n", " \n", @@ -1856,7 +1837,7 @@ " 1928-06\n", " 8 months\n", " 244.0\n", - " 273\n", + " 273 days\n", " 29.0\n", " \n", " \n", @@ -1866,7 +1847,7 @@ " 1928-04\n", " 3 months\n", " 91.0\n", - " 120\n", + " 120 days\n", " 29.0\n", " \n", " \n", @@ -1876,7 +1857,7 @@ " 1930-04\n", " 10 months\n", " 304.0\n", - " 333\n", + " 333 days\n", " 29.0\n", " \n", " \n", @@ -1886,7 +1867,7 @@ " 1930-04\n", " 3 months\n", " 90.0\n", - " 119\n", + " 119 days\n", " 29.0\n", " \n", " \n", @@ -1896,7 +1877,7 @@ " 1930-04\n", " 3 months\n", " 90.0\n", - " 119\n", + " 119 days\n", " 29.0\n", " \n", " \n", @@ -1906,7 +1887,7 @@ " 1930-09\n", " 8 months\n", " 243.0\n", - " 272\n", + " 272 days\n", " 29.0\n", " \n", " \n", @@ -1916,7 +1897,7 @@ " 1930-06\n", " 4 months\n", " 120.0\n", - " 149\n", + " 149 days\n", " 29.0\n", " \n", " \n", @@ -1941,42 +1922,42 @@ "412 Jacques Delmond 1930-01 1930-09 \n", "415 Loren Mozley 1930-02 1930-06 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \\\n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 \n", - "272 2 months 60.0 89 \n", - "293 7 months 214.0 244 \n", - "321 11 months 337.0 365 \n", - "331 3 months 92.0 122 \n", - "337 8 months 244.0 273 \n", - "349 3 months 91.0 120 \n", - "388 10 months 304.0 333 \n", - "408 3 months 90.0 119 \n", - "409 3 months 90.0 119 \n", - "412 8 months 243.0 272 \n", - "415 4 months 120.0 149 \n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days \n", + "272 2 months 60.0 89 days \n", + "293 7 months 214.0 244 days \n", + "321 11 months 337.0 365 days \n", + "331 3 months 92.0 122 days \n", + "337 8 months 244.0 273 days \n", + "349 3 months 91.0 120 days \n", + "388 10 months 304.0 333 days \n", + "408 3 months 90.0 119 days \n", + "409 3 months 90.0 119 days \n", + "412 8 months 243.0 272 days \n", + "415 4 months 120.0 149 days \n", "\n", - " duration_diff \n", - "28 365.0 \n", - "70 365.0 \n", - "234 27.0 \n", - "260 30.0 \n", - "272 29.0 \n", - "293 30.0 \n", - "321 28.0 \n", - "331 30.0 \n", - "337 29.0 \n", - "349 29.0 \n", - "388 29.0 \n", - "408 29.0 \n", - "409 29.0 \n", - "412 29.0 \n", - "415 29.0 " + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "321 28.0 \n", + "331 30.0 \n", + "337 29.0 \n", + "349 29.0 \n", + "388 29.0 \n", + "408 29.0 \n", + "409 29.0 \n", + "412 29.0 \n", + "415 29.0 " ] }, - "execution_count": 44, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -2001,7 +1982,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2087,7 +2068,7 @@ "606 G. E. Pulsford --01-20 --01-28 8.0" ] }, - "execution_count": 32, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2101,7 +2082,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2187,7 +2168,7 @@ "29908 Ann Samyn 1961-10-04 1962-03-21 168.0" ] }, - "execution_count": 33, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2198,7 +2179,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2243,7 +2224,7 @@ " --01-07\n", " --01-13\n", " 6.0\n", - " 6\n", + " 6 days\n", " \n", " \n", " 603\n", @@ -2251,7 +2232,7 @@ " --01-12\n", " --01-20\n", " 8.0\n", - " 8\n", + " 8 days\n", " \n", " \n", " 604\n", @@ -2259,7 +2240,7 @@ " --01-16\n", " --02-16\n", " 31.0\n", - " 31\n", + " 31 days\n", " \n", " \n", " 605\n", @@ -2267,7 +2248,7 @@ " --01-19\n", " --01-24\n", " 5.0\n", - " 5\n", + " 5 days\n", " \n", " \n", " 606\n", @@ -2275,7 +2256,7 @@ " --01-20\n", " --01-28\n", " 8.0\n", - " 8\n", + " 8 days\n", " \n", " \n", " 607\n", @@ -2283,7 +2264,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 608\n", @@ -2291,7 +2272,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 609\n", @@ -2299,7 +2280,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 610\n", @@ -2307,7 +2288,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " \n", " \n", " 611\n", @@ -2315,27 +2296,27 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " \n", " \n", "\n", "" ], "text/plain": [ - " member_names start_date end_date borrow_duration_days undate_duration\n", - "602 G. E. Pulsford --01-07 --01-13 6.0 6\n", - "603 G. E. Pulsford --01-12 --01-20 8.0 8\n", - "604 Robert D. Sage --01-16 --02-16 31.0 31\n", - "605 Gertrude Stein --01-19 --01-24 5.0 5\n", - "606 G. E. Pulsford --01-20 --01-28 8.0 8\n", - "607 Gertrude Stein --01-24 --03-20 55.0 55\n", - "608 Gertrude Stein --01-24 --03-20 55.0 55\n", - "609 Gertrude Stein --01-24 --03-20 55.0 55\n", - "610 Gertrude Stein --01-24 --05-30 126.0 126\n", - "611 Gertrude Stein --01-24 --05-30 126.0 126" + " member_names start_date end_date borrow_duration_days undate_duration\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 6 days\n", + "603 G. E. Pulsford --01-12 --01-20 8.0 8 days\n", + "604 Robert D. Sage --01-16 --02-16 31.0 31 days\n", + "605 Gertrude Stein --01-19 --01-24 5.0 5 days\n", + "606 G. E. Pulsford --01-20 --01-28 8.0 8 days\n", + "607 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "608 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "609 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "610 Gertrude Stein --01-24 --05-30 126.0 126 days\n", + "611 Gertrude Stein --01-24 --05-30 126.0 126 days" ] }, - "execution_count": 34, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2348,7 +2329,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2394,7 +2375,7 @@ " --01-07\n", " --01-13\n", " 6.0\n", - " 6\n", + " 6 days\n", " 0.0\n", " \n", " \n", @@ -2403,7 +2384,7 @@ " --01-12\n", " --01-20\n", " 8.0\n", - " 8\n", + " 8 days\n", " 0.0\n", " \n", " \n", @@ -2412,7 +2393,7 @@ " --01-16\n", " --02-16\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -2421,7 +2402,7 @@ " --01-19\n", " --01-24\n", " 5.0\n", - " 5\n", + " 5 days\n", " 0.0\n", " \n", " \n", @@ -2430,7 +2411,7 @@ " --01-20\n", " --01-28\n", " 8.0\n", - " 8\n", + " 8 days\n", " 0.0\n", " \n", " \n", @@ -2439,7 +2420,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2448,7 +2429,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2457,7 +2438,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2466,7 +2447,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " 0.0\n", " \n", " \n", @@ -2475,7 +2456,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " 0.0\n", " \n", " \n", @@ -2483,45 +2464,45 @@ "" ], "text/plain": [ - " member_names start_date end_date borrow_duration_days \\\n", - "602 G. E. Pulsford --01-07 --01-13 6.0 \n", - "603 G. E. Pulsford --01-12 --01-20 8.0 \n", - "604 Robert D. Sage --01-16 --02-16 31.0 \n", - "605 Gertrude Stein --01-19 --01-24 5.0 \n", - "606 G. E. Pulsford --01-20 --01-28 8.0 \n", - "607 Gertrude Stein --01-24 --03-20 55.0 \n", - "608 Gertrude Stein --01-24 --03-20 55.0 \n", - "609 Gertrude Stein --01-24 --03-20 55.0 \n", - "610 Gertrude Stein --01-24 --05-30 126.0 \n", - "611 Gertrude Stein --01-24 --05-30 126.0 \n", + " member_names start_date end_date borrow_duration_days undate_duration \\\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 6 days \n", + "603 G. E. Pulsford --01-12 --01-20 8.0 8 days \n", + "604 Robert D. Sage --01-16 --02-16 31.0 31 days \n", + "605 Gertrude Stein --01-19 --01-24 5.0 5 days \n", + "606 G. E. Pulsford --01-20 --01-28 8.0 8 days \n", + "607 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "608 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "609 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "610 Gertrude Stein --01-24 --05-30 126.0 126 days \n", + "611 Gertrude Stein --01-24 --05-30 126.0 126 days \n", "\n", - " undate_duration duration_diff \n", - "602 6 0.0 \n", - "603 8 0.0 \n", - "604 31 0.0 \n", - "605 5 0.0 \n", - "606 8 0.0 \n", - "607 55 0.0 \n", - "608 55 0.0 \n", - "609 55 0.0 \n", - "610 126 0.0 \n", - "611 126 0.0 " + " duration_diff \n", + "602 0.0 \n", + "603 0.0 \n", + "604 0.0 \n", + "605 0.0 \n", + "606 0.0 \n", + "607 0.0 \n", + "608 0.0 \n", + "609 0.0 \n", + "610 0.0 \n", + "611 0.0 " ] }, - "execution_count": 36, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# what's the difference between the two?\n", - "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration - row.borrow_duration_days, axis=1)\n", + "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.borrow_duration_days, axis=1)\n", "borrow_duration.head(10)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2538,7 +2519,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 37, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2556,14 +2537,21 @@ "source": [ "Woohoo, everything matches! 🎉\n", "\n", + "* * * \n", + "\n", "In a previous run, there were two borrow events where the calculation did not match; this was due to an error in undate duration method when the start and end dates have unknown years and dates wrap to the following year (e.g., december to january), which has now been corrected.\n", "\n", - "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#)." + "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#).\n", + "\n", + "* * * \n", + "\n", + "In a preliminary implementation of the numpy datetime64 integration, the new earliest possible year turned out to be a leap year, resulting in the counts for Gertrude Stein's borrows from January to March to be off by one. This was corrected by adjusting the minimum year by one to ensure it is not a leap year.\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2572,54 +2560,10 @@ "id": "-Bq76gtDWljg", "outputId": "f1ee526d-b938-4cbf-e93c-c6c91c077ae7" }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
member_namesstart_dateend_dateborrow_duration_daysundate_durationduration_diff
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [member_names, start_date, end_date, borrow_duration_days, undate_duration, duration_diff]\n", - "Index: []" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "borrow_duration[borrow_duration.duration_diff != 0]" + "# Confirm that we have no mismatches\n", + "assert len(borrow_duration[borrow_duration.duration_diff != 0]) == 0" ] } ], @@ -2648,7 +2592,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/src/undate/undate.py b/src/undate/undate.py index cb7d30a..bf248ff 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -5,7 +5,7 @@ # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Dict, Optional, Union -from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision +from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision, Timedelta from undate.dateformat.base import BaseDateFormat @@ -33,8 +33,8 @@ class Undate: # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units # It just so happens that int(2.5e16) is a leap year, which is a weird default, # so let's increase our lower bound by one year. - MIN_ALLOWABLE_YEAR = int(2.5e16) + 1 - MAX_ALLOWABLE_YEAR = int(-2.5e16) + MIN_ALLOWABLE_YEAR = int(-2.5e16) + 1 + MAX_ALLOWABLE_YEAR = int(2.5e16) def __init__( self, @@ -73,8 +73,8 @@ def __init__( else: # use the configured min/max allowable years if we # don't have any other bounds - max_year = self.MIN_ALLOWABLE_YEAR - min_year = self.MAX_ALLOWABLE_YEAR + min_year = self.MIN_ALLOWABLE_YEAR + max_year = self.MAX_ALLOWABLE_YEAR # if month is passed in as a string but completely unknown, # treat as none @@ -126,6 +126,9 @@ def __init__( if day is not None: min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) + # TODO: special case, if we get a Feb 29 date with unknown year, + # must switch the min/max years to known leap years! + # for unknowns, assume smallest possible value for earliest and # largest valid for latest self.earliest = Date(min_year, min_month, min_day) @@ -290,7 +293,7 @@ def is_known(self, part: str) -> bool: def is_partially_known(self, part: str) -> bool: return isinstance(self.initial_values[part], str) - def duration(self): # -> np.timedelta64: + def duration(self) -> Timedelta: """What is the duration of this date? Calculate based on earliest and latest date within range, taking into account the precision of the date even if not all @@ -399,11 +402,11 @@ def __eq__(self, other) -> bool: # consider interval equal if both dates are equal return self.earliest == other.earliest and self.latest == other.latest - def duration(self): # -> np.timedelta64: + def duration(self) -> Timedelta: """Calculate the duration between two undates. :returns: A duration - :rtype: numpy.timedelta64 + :rtype: Timedelta """ # what is the duration of this date range? @@ -423,7 +426,7 @@ def duration(self): # -> np.timedelta64: # if we get a negative, we've wrapped from end of one year # to the beginning of the next; # recalculate assuming second date is in the subsequent year - if duration.astype("int") < 0: + if duration.days < 0: end = self.latest.earliest + ONE_YEAR duration = end - self.earliest.earliest diff --git a/tests/test_undate.py b/tests/test_undate.py index 9e81f97..c1d3792 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -1,3 +1,4 @@ +import calendar from datetime import date, timedelta import numpy as np @@ -113,6 +114,10 @@ def test_init_partially_known_day(self): uncertain_day = Undate(2024, 2, "2X") assert uncertain_day.latest.day == 29 + # TODO: handle leap day in an unknown year + # (currently causes an exception because min/max years are not leap years) + # Undate(None, 2, 29) + def test_init_invalid(self): with pytest.raises(ValueError): Undate("19xx") @@ -294,7 +299,7 @@ def test_duration(self): assert january_duration.days == 31 feb_duration = Undate(2022, 2).duration() assert feb_duration.days == 28 - # next leap year will be 2024 + # 2024 is a known leap year leapyear_feb_duration = Undate(2024, 2).duration() assert leapyear_feb_duration.days == 29 @@ -391,6 +396,9 @@ def test_not_eq(self): ) assert UndateInterval(Undate(2022, 5)) != UndateInterval(Undate(2022, 6)) + def test_min_year_non_leapyear(self): + assert not calendar.isleap(Undate.MIN_ALLOWABLE_YEAR) + def test_duration(self): week_duration = UndateInterval( Undate(2022, 11, 1), Undate(2022, 11, 7) @@ -415,14 +423,21 @@ def test_duration(self): month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.days == 31 + assert month_noyear_duration.days == 32 - # real case from Shakespeare and Company Project data; + # real world test cases from Shakespeare and Company Project data; # second date is a year minus one day in the future month_noyear_duration = UndateInterval( Undate(None, 6, 7), Undate(None, 6, 6) ).duration() assert month_noyear_duration.days == 365 + # durations that span february in unknown years should assume + # non-leap years + jan_march_duration = UndateInterval( + Undate(None, 2, 28), Undate(None, 3, 1) + ).duration() + assert jan_march_duration.days == 2 + # duration is not supported for open-ended intervals assert UndateInterval(Undate(2000), None).duration() == NotImplemented From 8a4b67edfebfe01746c401fd7f0f040ec0acda73 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Nov 2024 17:01:51 -0500 Subject: [PATCH 11/15] Clean items flagged by @ColeDCrawford in code review --- .github/workflows/unit_tests.yml | 4 ++-- .pythonversion | 1 + DEVELOPER_NOTES.md | 14 ++++++++------ pyproject.toml | 5 ++--- src/undate/date.py | 18 ++++++++++++------ src/undate/dateformat/base.py | 4 ++-- 6 files changed, 27 insertions(+), 19 deletions(-) create mode 100644 .pythonversion diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 5fdc16b..456fe8a 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -11,14 +11,14 @@ on: env: # python version used to calculate and submit code coverage - COV_PYTHON_VERSION: "3.11" + COV_PYTHON_VERSION: "3.12" jobs: python-unit: runs-on: ubuntu-latest strategy: matrix: - python: ["3.9", "3.10", "3.11", "3.12"] + python: ["3.9", "3.10", "3.11", "3.12", "3.13"] defaults: run: working-directory: . diff --git a/.pythonversion b/.pythonversion new file mode 100644 index 0000000..4516194 --- /dev/null +++ b/.pythonversion @@ -0,0 +1 @@ +Python 3.12.7 diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index 7817c88..6d4918c 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -31,10 +31,12 @@ If you cannot or prefer not to install git flow, you can do the equivalent manua ### Create a Python virtual environment -Use a recent version of python 3. We highly recommend using a python virtualenv, e.g. +Use a recent version of python 3 (we recommend 3.12). If you use [pyenv](https://github.com/pyenv/pyenv), run `pyenv install` to get the current recommended python version for development (specified in `.pythonversion`). + +We highly recommend using a python virtualenv to isolate dependencies, e.g. ``` -python3 -m venv undate -source undate/bin/activate +python3 -m venv .venv +source .venv/bin/activate ``` ### Install local version of undate with development python dependencies @@ -47,12 +49,12 @@ pip install -e ".[dev]" ### Install pre-commit hooks +We use [pre-commit](https://pre-commit.com/) for automated checks and consistent formatting. If you're planning to contribute, please install these when you set up your local development. + ```sh pre-commit install ``` -We use [pre-commit](https://pre-commit.com/) for automated checks and consistent formatting. If you're planning to contribute, please install these when you set up your local development. - ## Tests, documentation, and other checks ### Running unit tests @@ -65,7 +67,7 @@ To test cases by method name, use `-k`: `pytest -k test_str` ### Check python types -Python typing is currently enforced on pull requests as part of a GitHub Actions Continuous Integration check using `mypy`. +Python typing is currently enforced on pull requests as part of a GitHub Actions Continuous Integration check using `mypy` and via pre-commit hook. To check types locally: 1. Install the necessary typing libraries (first run only): diff --git a/pyproject.toml b/pyproject.toml index 3f3325b..d853632 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,6 @@ authors = [ { name = "Julia Damerow" }, { name = "Robert Casties" }, { name = "Malte Vogl" }, - # {name = "DHTech", email="dhtech.community@gmail.com"} ? ] # currently no maintainers separate from authors @@ -47,7 +46,7 @@ classifiers = [ [project.optional-dependencies] docs = ["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"] dev = [ - "black>=22.10.0", + "ruff", "pre-commit>=2.20.0", "twine", "wheel", @@ -79,7 +78,7 @@ include = ["src/undate/**/*.py", "src/undate/**/*.lark", "/tests"] pythonpath = "src/" markers = [ "last : run marked tests after all others", - "first : run marked tests before all others", + "first : run marked tests efore all others", ] [tool.mypy] diff --git a/src/undate/date.py b/src/undate/date.py index 132384e..8d58e47 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -87,20 +87,22 @@ def __array_finalize__(self, obj): # custom properties to access year, month, day @property - def year(self): + def year(self) -> int: return int(str(self.astype("datetime64[Y]"))) @property - def month(self): + def month(self) -> int | None: # if date unit is year, don't return a month (only M/D) if self.dtype != "datetime64[Y]": return int(str(self.astype("datetime64[M]")).split("-")[-1]) + return None @property - def day(self): + def day(self) -> int | None: # only return a day if date unit is in days if self.dtype == "datetime64[D]": return int(str(self.astype("datetime64[D]")).split("-")[-1]) + return None def __sub__(self, other): # modify to conditionally return a timedelta object instead of a @@ -126,12 +128,16 @@ class DatePrecision(IntEnum): # it is: a day is more precise than a month, a month is more precise than a year, # (DatePrecision.year < DatePrecision.month) + #: century + CENTURY = 1 + #: decade + DECADE = 2 #: year - YEAR = 1 + YEAR = 3 #: month - MONTH = 2 + MONTH = 4 #: day - DAY = 3 + DAY = 5 def __str__(self): return f"{self.name}" diff --git a/src/undate/dateformat/base.py b/src/undate/dateformat/base.py index 46a25ba..59777b1 100644 --- a/src/undate/dateformat/base.py +++ b/src/undate/dateformat/base.py @@ -15,7 +15,7 @@ import importlib import logging import pkgutil -from functools import lru_cache # functools.cache not available until 3.9 +from functools import cache from typing import Dict, Type logger = logging.getLogger(__name__) @@ -39,7 +39,7 @@ def to_string(self, undate) -> str: # cache import class method to ensure we only import once @classmethod - @lru_cache + @cache def import_formatters(cls) -> int: """Import all undate.dateformat formatters so that they will be included in available formatters From ed3f493bac0b1a81381dfa96c61d5cbc4d2d8932 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Nov 2024 17:03:52 -0500 Subject: [PATCH 12/15] Use union syntax for py3.9 type hints --- src/undate/date.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/undate/date.py b/src/undate/date.py index 8d58e47..134053c 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -91,14 +91,14 @@ def year(self) -> int: return int(str(self.astype("datetime64[Y]"))) @property - def month(self) -> int | None: + def month(self) -> Union[int, None]: # if date unit is year, don't return a month (only M/D) if self.dtype != "datetime64[Y]": return int(str(self.astype("datetime64[M]")).split("-")[-1]) return None @property - def day(self) -> int | None: + def day(self) -> Union[int, None]: # only return a day if date unit is in days if self.dtype == "datetime64[D]": return int(str(self.astype("datetime64[D]")).split("-")[-1]) From e197cb264abbaefba7711d046629e4a80284fe63 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Nov 2024 17:09:26 -0500 Subject: [PATCH 13/15] Switch CI check from black to ruff --- .github/workflows/check.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 41d8fc3..d11cb61 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -23,9 +23,9 @@ jobs: run: pip install -e ".[dev]" if: steps.python-cache.outputs.cache-hit != 'true' - # check code style - - name: Run black - run: black src --check --diff + # check with ruff + - name: Run ruff + run: ruff check # check docs - name: Check that documentation can be built From 61be76fadd1cb4227ae15ab365d7e95deb99efd7 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Nov 2024 17:18:59 -0500 Subject: [PATCH 14/15] Clean up to pass ruff and mypy checks --- .pre-commit-config.yaml | 4 ++-- src/undate/date.py | 4 ++-- src/undate/dateformat/__init__.py | 2 +- src/undate/dateformat/edtf/transformer.py | 2 +- src/undate/dateformat/iso8601.py | 6 +++--- src/undate/undate.py | 2 +- tests/test_dateformat/test_base.py | 2 +- tests/test_undate.py | 3 +-- 8 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f2e1151..dfeb0d9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.2 + rev: v1.13.0 hooks: - id: mypy - additional_dependencies: [types-python-dateutil, numpy] + additional_dependencies: [numpy] diff --git a/src/undate/date.py b/src/undate/date.py index 134053c..bac47f3 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -91,14 +91,14 @@ def year(self) -> int: return int(str(self.astype("datetime64[Y]"))) @property - def month(self) -> Union[int, None]: + def month(self) -> Optional[int]: # if date unit is year, don't return a month (only M/D) if self.dtype != "datetime64[Y]": return int(str(self.astype("datetime64[M]")).split("-")[-1]) return None @property - def day(self) -> Union[int, None]: + def day(self) -> Optional[int]: # only return a day if date unit is in days if self.dtype == "datetime64[D]": return int(str(self.astype("datetime64[D]")).split("-")[-1]) diff --git a/src/undate/dateformat/__init__.py b/src/undate/dateformat/__init__.py index 7092a80..5dc5c3c 100644 --- a/src/undate/dateformat/__init__.py +++ b/src/undate/dateformat/__init__.py @@ -1,3 +1,3 @@ -from undate.dateformat.base import BaseDateFormat +from undate.dateformat.base import BaseDateFormat as BaseDateFormat # from undate.dateformat.iso8601 import ISO8601DateFormat diff --git a/src/undate/dateformat/edtf/transformer.py b/src/undate/dateformat/edtf/transformer.py index 2afab28..a5578de 100644 --- a/src/undate/dateformat/edtf/transformer.py +++ b/src/undate/dateformat/edtf/transformer.py @@ -68,4 +68,4 @@ def year_fivedigitsplus(self, token): # strip off the leading Y and convert to integer # TODO: undate is currently limited to 4-digit years # (datetime max year of 9999) - return tok.update(int(token[:1])) + return token.update(int(token[:1])) diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index 2366cc1..aa3296c 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -62,10 +62,10 @@ def to_string(self, undate: Undate) -> str: # and not others; force year to always be 4 digits if date_portion == "year": date_parts.append("%04d" % undate.earliest.year) - elif date_portion == "month": + elif date_portion == "month" and undate.earliest.month: date_parts.append("%02d" % undate.earliest.month) - elif date_portion == "day": - date_parts.append("%02d" % undate.earliest.day) + elif date_portion == "day" and undate.earliest.day: + date_parts.append("%02d" % undate.earliest.day) # type: ignore elif date_portion == "year": # if not known but this is year, add '-' for --MM-DD unknown year format diff --git a/src/undate/undate.py b/src/undate/undate.py index bf248ff..3ee5dc4 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -159,7 +159,7 @@ def __str__(self) -> str: f"{day:02d}" if isinstance(day, int) else day, ] # combine, skipping any values that are None - return "-".join([str(p) for p in parts if p != None]) + return "-".join([str(p) for p in parts if p is not None]) return self.formatter.to_string(self) diff --git a/tests/test_dateformat/test_base.py b/tests/test_dateformat/test_base.py index 26a9b97..1d184db 100644 --- a/tests/test_dateformat/test_base.py +++ b/tests/test_dateformat/test_base.py @@ -7,7 +7,7 @@ class TestBaseDateFormat: def test_available_formatters(self): available_formatters = BaseDateFormat.available_formatters() - assert type(available_formatters) == dict + assert isinstance(available_formatters, dict) # NOTE: import _after_ generating available formatters # so we can confirm it gets loaded diff --git a/tests/test_undate.py b/tests/test_undate.py index c1d3792..39c1f86 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -1,7 +1,6 @@ import calendar -from datetime import date, timedelta +from datetime import date -import numpy as np import pytest from undate.date import Timedelta from undate.undate import Undate, UndateInterval From e494d7a2638b4e94c5038b02180ba6020d7a2820 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 7 Nov 2024 17:25:33 -0500 Subject: [PATCH 15/15] Update project classifiers --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d853632..7cc27da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,12 +35,16 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Intended Audience :: Developers", + "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering", "Topic :: Utilities", "Typing :: Typed", + ] [project.optional-dependencies]