%import common.ESCAPED_STRING   -> STRING
%import common.INT              -> INT
%import common.WS
%ignore WS

// --- EDTF / ISO 8601-2 ---

?edtf: edtf_level0 | edtf_level1

// --- EDTF Level 0 / ISO 8601-1 ---

?edtf_level0: date | timeinterval
// not implementing datetime for now

date: year | year "-" month | year "-" month "-" day

year: INT
// two-digit month, 01 through 12
month: /(0[1-9])|(1[0-2])/
// two-digit day, 01 through 31 (including 10, 20 and 30);
// the grammar does not validate the day against the month length
day: /(0[1-9])|([12][0-9])|(3[01])/

timeinterval: date "/" date


// --- EDTF Level 1 ---

?edtf_level1: date_level1 | extended_interval

// qualification may occur at the end of the date
qualification: uncertain | approximate | uncertain_approximate

uncertain: "?"
approximate: "~"
uncertain_approximate: "%"

// The character 'X' may be used in place of one or more rightmost
// digits to indicate that the value of that digit is unspecified
unspecified: /X/
?year_unspecified: /\d+/ unspecified+
?month_unspecified: "0".."1"? unspecified ~ 1..2
//?year_month_unspecified: year_l1 "-" month_unspecified
?day_unspecified: "0".."3"? unspecified ~ 1..2

// 'Y' may be used at the beginning of the date string to signify that
// the date is a year, when (and only when) the year exceeds four digits,
// i.e. for years later than 9999 or earlier than -9999.
year_fivedigitsplus: /Y\d{5,}/
?year_l1: year_fivedigitsplus | year | year_unspecified

// The values 21, 22, 23, 24 may be used to signify
// 'Spring', 'Summer', 'Autumn', 'Winter', respectively,
// in place of a month value (01 through 12) for a year-and-month format string.
season: /2[1-4]/
?year_season: year_l1 "-" season

date_level1: (year_l1
    | year_l1 "-" (month | month_unspecified)
    | year_l1 "-" (month | month_unspecified) "-" (day | day_unspecified)
    | year_season) qualification?

// unknown date: double dot or empty string
unknown_date: ".."?
extended_interval: date_level1 "/" date_level1
    | date_level1 "/" unknown_date
    | unknown_date "/" date_level1

// negative calendar year?
from lark import Transformer, Tree, Token
from undate.undate import Undate, UndateInterval


class EDTFTransformer(Transformer):
    """Transform an EDTF parse tree into an Undate or UndateInterval.

    Each method is a Lark callback named after a grammar rule (or, for
    ``INT``, a terminal).  Rule callbacks receive the list of already
    transformed children of the matching subtree.
    """

    # terminal callback: convert INT tokens to python integers
    INT = int

    def timeinterval(self, items):
        """Combine two transformed dates into an UndateInterval."""
        # transformed result from parser should be two undate objects;
        # combine into an interval
        return UndateInterval(*items)

    def date(self, items):
        """Build an Undate from the year / month / day subtrees in *items*.

        Children that are not ``year``, ``month`` or ``day`` subtrees are
        ignored.
        """
        parts = {}
        for child in items:
            # only subtrees carry a rule name; skip bare tokens/values
            # instead of failing on a missing ``data`` attribute
            if not isinstance(child, Tree):
                continue
            if child.data in ("year", "month", "day"):
                # in each case we expect one value;
                # anonymous tokens convert to their value
                value = child.children[0]
                # convert to integer when possible; otherwise pass as string
                # (e.g. "XX" for unspecified digits)
                try:
                    value = int(value)
                except ValueError:
                    value = str(value)
                parts[str(child.data)] = value

        return Undate(**parts)

    def extended_interval(self, items):
        """Combine level 1 interval parts; either end may be None."""
        # same as level 0 time interval, except one item may be None
        # for an open-ended range
        return self.timeinterval(items)

    def unknown_date(self, token):
        """Return None, the interval representation of an unknown date."""
        return None

    def get_values(self, items):
        """Return the string values of all tokens in *items*.

        Recurses into subtrees so nested tokens are included, in order.
        """
        values = []
        for item in items:
            if isinstance(item, Token):
                values.append(str(item))
            elif isinstance(item, Tree):
                values.extend(self.get_values(item.children))
        return values

    def _joined_tree(self, name, items):
        """Join all token values in *items* into one string under a *name* subtree."""
        return Tree(data=name, children=["".join(self.get_values(items))])

    def year_unspecified(self, items):
        """Return a ``year`` subtree holding digits and X placeholders as one string."""
        # combine parts (numeric & unknown) into a single string
        return self._joined_tree("year", items)

    def month_unspecified(self, items):
        """Return a ``month`` subtree holding digits and X placeholders as one string."""
        return self._joined_tree("month", items)

    def day_unspecified(self, items):
        """Return a ``day`` subtree holding digits and X placeholders as one string."""
        return self._joined_tree("day", items)

    def date_level1(self, items):
        """Build an Undate from a level 1 date, using the same logic as ``date``."""
        return self.date(items)

    def year_fivedigitsplus(self, token):
        """Return a ``year`` subtree with the integer value of a ``Y``-prefixed year.

        Lark passes rule callbacks a list of children, so *token* is
        normally a one-element list; a bare token is accepted as well.
        """
        tok = token[0] if isinstance(token, (list, tuple)) else token
        # strip off the leading Y and convert to integer
        # TODO: undate is currently limited to 4-digit years
        # (datetime max year of 9999)
        year = int(str(tok).lstrip("Y"))
        # wrap as a year subtree so date/date_level1 can consume it
        # the same way as the other year callbacks
        return Tree(data="year", children=[year])
@pytest.mark.parametrize("date_string,expected", testcases)
def test_transform(date_string, expected):
    """Parsing and transforming an EDTF string yields the expected undate object."""
    # parse first, then hand the resulting tree to a fresh transformer
    tree = edtf_parser.parse(date_string)
    result = EDTFTransformer().transform(tree)
    assert result == expected