From ca4e1cbd5610ecd1670c70972602aea77ea85b70 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 3 Nov 2023 16:26:21 -0400 Subject: [PATCH 1/7] Preliminary edtf parsing (levels 0 and 1) with lark --- setup.cfg | 1 + src/undate/dateformat/edtf/__init__.py | 0 src/undate/dateformat/edtf/edtf.lark | 60 +++++++++++++++++++ src/undate/dateformat/edtf/parser.py | 46 ++++++++++++++ .../test_dateformat/edtf/test_edtf_parser.py | 46 ++++++++++++++ 5 files changed, 153 insertions(+) create mode 100644 src/undate/dateformat/edtf/__init__.py create mode 100644 src/undate/dateformat/edtf/edtf.lark create mode 100644 src/undate/dateformat/edtf/parser.py create mode 100644 tests/test_dateformat/edtf/test_edtf_parser.py diff --git a/setup.cfg b/setup.cfg index 217c8ee..bd5eb0e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -43,6 +43,7 @@ install_requires = [options.extras_require] all = + lark %(dev)s %(test)s dev = diff --git a/src/undate/dateformat/edtf/__init__.py b/src/undate/dateformat/edtf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/undate/dateformat/edtf/edtf.lark b/src/undate/dateformat/edtf/edtf.lark new file mode 100644 index 0000000..370eee4 --- /dev/null +++ b/src/undate/dateformat/edtf/edtf.lark @@ -0,0 +1,60 @@ +%import common.ESCAPED_STRING -> STRING +%import common.INT -> INT +%import common.WS +%ignore WS + +// --- EDTF / ISO 8601-2 --- + +edtf: edtf_level0 | edtf_level1 + +// --- EDTF Level 0 / ISO 8601-1 --- + +edtf_level0: date | timeinterval +// not implementing datetime for now + +date: year | year_month | year_month_day + +year: INT +month: /(0[1-9])|(1[0-2])/ +year_month: year "-" month +day: /([0-2][1-9])|(3[0-1])/ +year_month_day: year "-" month "-" day + +timeinterval: date "/" date + + + +// EDTF Level 1 + +edtf_level1: date_level1 | extended_interval + +qualification: uncertain | approximate | uncertain_approximate + +uncertain: "?" 
+approximate: "~" +uncertain_approximate: "%" + +// The character 'X' may be used in place of one or more rightmost digits to indicate that the value of that digit is unspecified +unspecified: "X" +year_unspecified: /\d+/ unspecified+ +month_unspecified: "0".."1"? unspecified ~ 1..2 +year_month_unspecified: year_l1 "-" month_unspecified +day_unspecified: "0".."3"? unspecified ~ 1..2 +year_month_day_unspecified: year_l1 "-" (month | month_unspecified) "-" day_unspecified + +// 'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. for years later than 9999 or earlier than -9999. +year_l1: ("Y" /\d{5,}/ | year | year_unspecified) qualification? +year_month_l1: (year_l1 "-" month | year_month_unspecified) qualification? +year_month_day_l1: (year_l1 "-" month "-" day | year_month_day_unspecified) qualification? + +// The values 21, 22, 23, 24 may be used used to signify ' Spring', 'Summer', 'Autumn', 'Winter', respectively, in place of a month value (01 through 12) for a year-and-month format string. +season: /2[1-4]/ +year_season: year_l1 "-" season + +date_level1: year_l1 | year_month_l1 | year_month_day_l1 | year_season + +// unknown date: double dot or empty string +unknown_date: ".."? +extended_interval: date_level1 "/" date_level1 | date_level1 "/" unknown_date | unknown_date "/" date_level1 + +// negative calendar year? 
\ No newline at end of file diff --git a/src/undate/dateformat/edtf/parser.py b/src/undate/dateformat/edtf/parser.py new file mode 100644 index 0000000..8826b2d --- /dev/null +++ b/src/undate/dateformat/edtf/parser.py @@ -0,0 +1,46 @@ +import os.path + +from lark import Lark + + +grammar_path = os.path.join(os.path.dirname(__file__), "edtf.lark") + +with open(grammar_path) as grammar: + edtf_parser = Lark(grammar.read(), start="edtf") + + +# testcases = [ +# "1984", +# "1984-05", +# "1984-12", +# "1001-03-30", +# "1000/2000", +# "1000-01/2000-05-01", +# # level 1 +# "Y170000002", +# "2001-21", # spring 2001 +# # qualifiers +# "1984?", +# "2004-06~", +# "2004-06-11%", +# # unspecified digits from right +# "201X", +# "20XX", +# "2004-XX", +# "1985-04-XX", +# "1985-XX-XX", +# # open ended intervals +# "1985-04-12/..", +# "1985-04/..", +# "../1985-04-12", +# "/1985-04-12", +# "1984-13", +# ] + +# for testcase in testcases: +# print(f"\n{testcase}") +# tree = edtf_parser.parse(testcase) +# print(tree.pretty()) + + +# error_cases = ["1984-13", "Y1702"] diff --git a/tests/test_dateformat/edtf/test_edtf_parser.py b/tests/test_dateformat/edtf/test_edtf_parser.py new file mode 100644 index 0000000..1c2f503 --- /dev/null +++ b/tests/test_dateformat/edtf/test_edtf_parser.py @@ -0,0 +1,46 @@ +import pytest + +from undate.dateformat.edtf.parser import edtf_parser + +# for now, just test that valid dates can be parsed + +testcases = [ + "1984", + "1984-05", + "1984-12", + "1001-03-30", + "1000/2000", + "1000-01/2000-05-01", + # level 1 + "Y170000002", + "2001-21", # spring 2001 + # qualifiers + "1984?", + "2004-06~", + "2004-06-11%", + # unspecified digits from right + "201X", + "20XX", + "2004-XX", + "1985-04-XX", + "1985-XX-XX", + # open ended intervals + "1985-04-12/..", + "1985-04/..", + "../1985-04-12", + "/1985-04-12", +] + + +def test_should_parse(): + for testcase in testcases: + assert edtf_parser.parse(testcase) + + +error_cases = ["1984-13", "Y1702"] + + +def 
test_should_error(): + for error_case in error_cases: + with pytest.raises(Exception): + edtf_parser.parse(error_cases) From 5d84edd435fdf051b2722cc5291b7369c210c68d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 3 Nov 2023 16:35:54 -0400 Subject: [PATCH 2/7] Put lark dependency in the right place --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index bd5eb0e..12969eb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,10 +40,11 @@ packages = find: python_requires = >=3.8 install_requires = python-dateutil + lark + [options.extras_require] all = - lark %(dev)s %(test)s dev = From ff3ea6d996970937d450ff58b2613b1e05c23b26 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 3 Nov 2023 17:44:20 -0400 Subject: [PATCH 3/7] Include lark grammar file in python package data --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index 12969eb..d716b0e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,9 @@ install_requires = python-dateutil lark +[options.package_data] +* = + *.lark [options.extras_require] all = From 12b645788d05b97628b446f5918e02722032e85f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 17 Nov 2023 09:30:20 -0500 Subject: [PATCH 4/7] Fix parsing for level 2 qualifier --- src/undate/dateformat/edtf/edtf.lark | 42 ++++++++++--------- .../test_dateformat/edtf/test_edtf_parser.py | 4 +- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/undate/dateformat/edtf/edtf.lark b/src/undate/dateformat/edtf/edtf.lark index 370eee4..2b971be 100644 --- a/src/undate/dateformat/edtf/edtf.lark +++ b/src/undate/dateformat/edtf/edtf.lark @@ -5,53 +5,55 @@ // --- EDTF / ISO 8601-2 --- -edtf: edtf_level0 | edtf_level1 +?edtf: edtf_level0 | edtf_level1 // --- EDTF Level 0 / ISO 8601-1 --- -edtf_level0: date | timeinterval +?edtf_level0: date | timeinterval // not implementing datetime for now -date: year | year_month | year_month_day +date: year | year "-" month | year 
"-" month "-" day year: INT month: /(0[1-9])|(1[0-2])/ -year_month: year "-" month day: /([0-2][1-9])|(3[0-1])/ -year_month_day: year "-" month "-" day timeinterval: date "/" date - // EDTF Level 1 -edtf_level1: date_level1 | extended_interval +?edtf_level1: date_level1 | extended_interval +// qualification may occur at the end of the date qualification: uncertain | approximate | uncertain_approximate uncertain: "?" approximate: "~" uncertain_approximate: "%" -// The character 'X' may be used in place of one or more rightmost digits to indicate that the value of that digit is unspecified -unspecified: "X" -year_unspecified: /\d+/ unspecified+ -month_unspecified: "0".."1"? unspecified ~ 1..2 -year_month_unspecified: year_l1 "-" month_unspecified +// The character 'X' may be used in place of one or more rightmost +// digits to indicate that the value of that digit is unspecified +unspecified: /X/ +?year_unspecified: /\d+/ unspecified+ +?month_unspecified: "0".."1"? unspecified ~ 1..2 +//?year_month_unspecified: year_l1 "-" month_unspecified day_unspecified: "0".."3"? unspecified ~ 1..2 -year_month_day_unspecified: year_l1 "-" (month | month_unspecified) "-" day_unspecified +?year_month_day_unspecified: year_l1 "-" (month | month_unspecified) "-" day_unspecified -// 'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. for years later than 9999 or earlier than -9999. -year_l1: ("Y" /\d{5,}/ | year | year_unspecified) qualification? -year_month_l1: (year_l1 "-" month | year_month_unspecified) qualification? -year_month_day_l1: (year_l1 "-" month "-" day | year_month_day_unspecified) qualification? +// 'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. for years later than 9999 or earlier than -9999. 
+year_fivedigitsplus: /Y\d{5,}/ +?year_l1: ( year_fivedigitsplus | year | year_unspecified) qualification? +//?year_month_l1: (year_l1 "-" month | year_month_unspecified) qualification? +?year_month_day_l1: year_l1 "-" month "-" day | year_month_day_unspecified -// The values 21, 22, 23, 24 may be used used to signify ' Spring', 'Summer', 'Autumn', 'Winter', respectively, in place of a month value (01 through 12) for a year-and-month format string. +// The values 21, 22, 23, 24 may be used used to signify +// ' Spring', 'Summer', 'Autumn', 'Winter', respectively, +// in place of a month value (01 through 12) for a year-and-month format string. season: /2[1-4]/ -year_season: year_l1 "-" season +?year_season: year_l1 "-" season -date_level1: year_l1 | year_month_l1 | year_month_day_l1 | year_season +date_level1: (year_l1 | year_l1 "-" (month | month_unspecified) | year_month_day_l1 | year_season) qualification? // unknown date: double dot or empty string unknown_date: ".."? diff --git a/tests/test_dateformat/edtf/test_edtf_parser.py b/tests/test_dateformat/edtf/test_edtf_parser.py index 1c2f503..837d2b5 100644 --- a/tests/test_dateformat/edtf/test_edtf_parser.py +++ b/tests/test_dateformat/edtf/test_edtf_parser.py @@ -34,7 +34,9 @@ def test_should_parse(): for testcase in testcases: - assert edtf_parser.parse(testcase) + tree = edtf_parser.parse(testcase) + assert tree + print(tree.pretty()) error_cases = ["1984-13", "Y1702"] From e989415b6ff79df41a12407fdfaad522b921a55a Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 17 Nov 2023 09:49:43 -0500 Subject: [PATCH 5/7] Preliminary transformer to convert edtf parsetree into undate objects --- src/undate/dateformat/edtf/edtf.lark | 16 +++-- src/undate/dateformat/edtf/transformer.py | 70 +++++++++++++++++++ .../edtf/test_edtf_transformer.py | 53 ++++++++++++++ 3 files changed, 132 insertions(+), 7 deletions(-) create mode 100644 src/undate/dateformat/edtf/transformer.py create mode 100644 
tests/test_dateformat/edtf/test_edtf_transformer.py diff --git a/src/undate/dateformat/edtf/edtf.lark b/src/undate/dateformat/edtf/edtf.lark index 2b971be..6b8e5aa 100644 --- a/src/undate/dateformat/edtf/edtf.lark +++ b/src/undate/dateformat/edtf/edtf.lark @@ -38,14 +38,11 @@ unspecified: /X/ ?year_unspecified: /\d+/ unspecified+ ?month_unspecified: "0".."1"? unspecified ~ 1..2 //?year_month_unspecified: year_l1 "-" month_unspecified -day_unspecified: "0".."3"? unspecified ~ 1..2 -?year_month_day_unspecified: year_l1 "-" (month | month_unspecified) "-" day_unspecified +?day_unspecified: "0".."3"? unspecified ~ 1..2 // 'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. for years later than 9999 or earlier than -9999. year_fivedigitsplus: /Y\d{5,}/ -?year_l1: ( year_fivedigitsplus | year | year_unspecified) qualification? -//?year_month_l1: (year_l1 "-" month | year_month_unspecified) qualification? -?year_month_day_l1: year_l1 "-" month "-" day | year_month_day_unspecified +?year_l1: year_fivedigitsplus | year | year_unspecified // The values 21, 22, 23, 24 may be used used to signify // ' Spring', 'Summer', 'Autumn', 'Winter', respectively, @@ -53,10 +50,15 @@ year_fivedigitsplus: /Y\d{5,}/ season: /2[1-4]/ ?year_season: year_l1 "-" season -date_level1: (year_l1 | year_l1 "-" (month | month_unspecified) | year_month_day_l1 | year_season) qualification? +date_level1: (year_l1 + | year_l1 "-" (month | month_unspecified) + | year_l1 "-" (month | month_unspecified) "-" (day | day_unspecified) + | year_season) qualification? // unknown date: double dot or empty string unknown_date: ".."? -extended_interval: date_level1 "/" date_level1 | date_level1 "/" unknown_date | unknown_date "/" date_level1 +extended_interval: date_level1 "/" date_level1 + | date_level1 "/" unknown_date + | unknown_date "/" date_level1 // negative calendar year? 
\ No newline at end of file diff --git a/src/undate/dateformat/edtf/transformer.py b/src/undate/dateformat/edtf/transformer.py new file mode 100644 index 0000000..cca3609 --- /dev/null +++ b/src/undate/dateformat/edtf/transformer.py @@ -0,0 +1,70 @@ +from lark import Transformer, Tree, Token +from undate.undate import Undate, UndateInterval + + +class EDTFTransformer(Transformer): + """transform edtf parse tree and return Undate or UndateInterval""" + + INT = int + + def timeinterval(self, items): + # transformed result from parser should be two undate objects; + # combine into an interval + return UndateInterval(*items) + + def date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one value; + # anonymous tokens convert to their value + value = child.children[0] + # convert to integer when possible; otherwise pass as string + try: + value = int(value) + except ValueError: + value = str(value) + parts[str(child.data)] = value + + return Undate(**parts) + + def extended_interval(self, items): + # same as level 1 time interval, except one item may be None + # for an open-ended range + return self.timeinterval(items) + + def unknown_date(self, token): + # unknown date for interval should be passed in as None + return None + + def get_values(self, items): + # get a list of values from tokens; recurses to get subtree tokens + values = [] + for i in items: + if isinstance(i, Token): + values.append(str(i)) + if isinstance(i, Tree): + values.extend(self.get_values(i.children)) + return values + + def year_unspecified(self, items): + # combine parts (numeric & unknown) into a single string + value = "".join(self.get_values(items)) + return Tree(data="year", children=[value]) + + def month_unspecified(self, items): + value = "".join(self.get_values(items)) + return Tree(data="month", children=[value]) + + def day_unspecified(self, items): + value = "".join(self.get_values(items)) + return 
Tree(data="day", children=[value]) + + def date_level1(self, items): + return self.date(items) + + def year_fivedigitsplus(self, token): + # lark passes the rule's children as a list; strip the leading Y from the single token, convert to integer, and wrap as a year tree for date() + # TODO: undate is currently limited to 4-digit years + # (datetime max year of 9999) + return Tree(data="year", children=[int(token[0][1:])]) diff --git a/tests/test_dateformat/edtf/test_edtf_transformer.py b/tests/test_dateformat/edtf/test_edtf_transformer.py new file mode 100644 index 0000000..f8d70d8 --- /dev/null +++ b/tests/test_dateformat/edtf/test_edtf_transformer.py @@ -0,0 +1,53 @@ +import pytest + +from undate.undate import Undate, UndateInterval +from undate.dateformat.edtf.parser import edtf_parser +from undate.dateformat.edtf.transformer import EDTFTransformer + +# for now, just test that valid dates can be parsed + +testcases = [ + ("1984", Undate(1984)), + ("1984-05", Undate(1984, 5)), + ("1984-12", Undate(1984, 12)), + ("1001-03-30", Undate(1001, 3, 30)), + ("1000/2000", UndateInterval(Undate(1000), Undate(2000))), + ("1000-01/2000-05-01", UndateInterval(Undate(1000, 1), Undate(2000, 5, 1))), + # # level 1 + # NOTE: undate currently doesn't support most of the level 1 functionality + # NOTE: undate currently doesn't support years beyond 9999 (datetime.MAXYEAR) + # ("Y17000002", Undate(17000002)), + # "2001-21", # spring 2001 + # # qualifiers + # "1984?", + # "2004-06~", + # "2004-06-11%", + # # unspecified digits from right + ("201X", Undate("201X")), + ("20XX", Undate("20XX")), + ("2004-XX", Undate(2004, "XX")), + ("1985-04-XX", Undate(1985, 4, "XX")), + ("1985-XX-XX", Undate(1985, "XX", "XX")), + # # open ended intervals + ("1985-04-12/..", UndateInterval(Undate(1985, 4, 12), None)), + ("1985-04/..", UndateInterval(Undate(1985, 4), None)), + ("../1985-04-12", UndateInterval(None, Undate(1985, 4, 12))), + ("/1985-04-12", UndateInterval(None, Undate(1985, 4, 12))), +] + + +def test_transform(): + transformer = EDTFTransformer() + + for testinput, output in testcases: + parsetree = 
edtf_parser.parse(testinput) + assert transformer.transform(parsetree) == output + + +# error_cases = ["1984-13", "Y1702"] + + +# def test_should_error(): +# for error_case in error_cases: +# with pytest.raises(Exception): +# edtf_parser.parse(error_cases) From f66c6f02e4b44d7be71bc8ef3ed03fbdc2ee7b24 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 27 Oct 2023 16:06:01 -0400 Subject: [PATCH 6/7] Clear cache before testing that formatters are only loaded once --- tests/test_dateformat/test_base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_dateformat/test_base.py b/tests/test_dateformat/test_base.py index 63568f0..3687a37 100644 --- a/tests/test_dateformat/test_base.py +++ b/tests/test_dateformat/test_base.py @@ -31,9 +31,12 @@ def test_parse_to_string(self): BaseDateFormat().to_string(1991) -@pytest.mark.first def test_import_formatters_import_only_once(caplog): - # run first so we can confirm it runs once + # clear the cache, since any instantiation of an Undate + # object anywhere in the test suite will populate it + BaseDateFormat.import_formatters.cache_clear() + + # run first, and confirm it runs and loads formatters with caplog.at_level(logging.DEBUG): import_count = BaseDateFormat.import_formatters() # should import at least one thing (iso8601) From 6221946c65fd17055d9d87cf6dfc9ed67a1caa57 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 19 Apr 2024 12:59:52 -0400 Subject: [PATCH 7/7] Clean up unit tests for parsing and transforming edtf dates --- .../test_dateformat/edtf/test_edtf_parser.py | 16 +++++++--------- .../edtf/test_edtf_transformer.py | 19 +++++-------------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/tests/test_dateformat/edtf/test_edtf_parser.py b/tests/test_dateformat/edtf/test_edtf_parser.py index 837d2b5..5a2b8ea 100644 --- a/tests/test_dateformat/edtf/test_edtf_parser.py +++ b/tests/test_dateformat/edtf/test_edtf_parser.py @@ -32,17 +32,15 @@ ] -def test_should_parse(): - 
for testcase in testcases: - tree = edtf_parser.parse(testcase) - assert tree - print(tree.pretty()) +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert edtf_parser.parse(date_string) error_cases = ["1984-13", "Y1702"] -def test_should_error(): - for error_case in error_cases: - with pytest.raises(Exception): - edtf_parser.parse(error_cases) +@pytest.mark.parametrize("date_string", error_cases) +def test_should_error(date_string): + with pytest.raises(Exception): + edtf_parser.parse(date_string) diff --git a/tests/test_dateformat/edtf/test_edtf_transformer.py b/tests/test_dateformat/edtf/test_edtf_transformer.py index f8d70d8..8dba34d 100644 --- a/tests/test_dateformat/edtf/test_edtf_transformer.py +++ b/tests/test_dateformat/edtf/test_edtf_transformer.py @@ -36,18 +36,9 @@ ] -def test_transform(): +@pytest.mark.parametrize("date_string,expected", testcases) +def test_transform(date_string, expected): transformer = EDTFTransformer() - - for testinput, output in testcases: - parsetree = edtf_parser.parse(testinput) - assert transformer.transform(parsetree) == output - - -# error_cases = ["1984-13", "Y1702"] - - -# def test_should_error(): -# for error_case in error_cases: -# with pytest.raises(Exception): -# edtf_parser.parse(error_cases) + # parse the input string, then transform to undate object + parsetree = edtf_parser.parse(date_string) + assert transformer.transform(parsetree) == expected