Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preliminary edtf parsing (levels 0 and some of 1) with lark #67

Merged
merged 7 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ packages = find:
python_requires = >=3.8
install_requires =
python-dateutil
lark

[options.package_data]
* =
*.lark

[options.extras_require]
all =
Expand Down
Empty file.
64 changes: 64 additions & 0 deletions src/undate/dateformat/edtf/edtf.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
%import common.ESCAPED_STRING -> STRING
%import common.INT -> INT
%import common.WS
%ignore WS

// --- EDTF / ISO 8601-2 ---

?edtf: edtf_level0 | edtf_level1

// --- EDTF Level 0 / ISO 8601-1 ---

?edtf_level0: date | timeinterval
// not implementing datetime for now

date: year | year "-" month | year "-" month "-" day

year: INT
month: /(0[1-9])|(1[0-2])/
// day of month, 01 through 31 (including 10 and 20)
day: /(0[1-9])|([12][0-9])|(3[01])/

timeinterval: date "/" date


// EDTF Level 1

?edtf_level1: date_level1 | extended_interval

// qualification may occur at the end of the date
qualification: uncertain | approximate | uncertain_approximate

uncertain: "?"
approximate: "~"
uncertain_approximate: "%"

// The character 'X' may be used in place of one or more rightmost
// digits to indicate that the value of that digit is unspecified
unspecified: /X/
?year_unspecified: /\d+/ unspecified+
?month_unspecified: "0".."1"? unspecified ~ 1..2
//?year_month_unspecified: year_l1 "-" month_unspecified
?day_unspecified: "0".."3"? unspecified ~ 1..2

// 'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. for years later than 9999 or earlier than -9999.
year_fivedigitsplus: /Y\d{5,}/
?year_l1: year_fivedigitsplus | year | year_unspecified

// The values 21, 22, 23, 24 may be used to signify
// 'Spring', 'Summer', 'Autumn', 'Winter', respectively,
// in place of a month value (01 through 12) for a year-and-month format string.
season: /2[1-4]/
?year_season: year_l1 "-" season

date_level1: (year_l1
| year_l1 "-" (month | month_unspecified)
| year_l1 "-" (month | month_unspecified) "-" (day | day_unspecified)
| year_season) qualification?

// unknown date: double dot or empty string
unknown_date: ".."?
extended_interval: date_level1 "/" date_level1
| date_level1 "/" unknown_date
| unknown_date "/" date_level1

// negative calendar year?
46 changes: 46 additions & 0 deletions src/undate/dateformat/edtf/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os.path

from lark import Lark


# the grammar file lives next to this module (shipped as package data)
grammar_path = os.path.join(os.path.dirname(__file__), "edtf.lark")

# read with an explicit encoding so that loading the grammar does not
# depend on the platform's default locale encoding
with open(grammar_path, encoding="utf-8") as grammar:
    # "edtf" is the top-level rule of the grammar
    edtf_parser = Lark(grammar.read(), start="edtf")


# testcases = [
# "1984",
# "1984-05",
# "1984-12",
# "1001-03-30",
# "1000/2000",
# "1000-01/2000-05-01",
# # level 1
# "Y170000002",
# "2001-21", # spring 2001
# # qualifiers
# "1984?",
# "2004-06~",
# "2004-06-11%",
# # unspecified digits from right
# "201X",
# "20XX",
# "2004-XX",
# "1985-04-XX",
# "1985-XX-XX",
# # open ended intervals
# "1985-04-12/..",
# "1985-04/..",
# "../1985-04-12",
# "/1985-04-12",
# "1984-13",
# ]

# for testcase in testcases:
# print(f"\n{testcase}")
# tree = edtf_parser.parse(testcase)
# print(tree.pretty())


# error_cases = ["1984-13", "Y1702"]
70 changes: 70 additions & 0 deletions src/undate/dateformat/edtf/transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from lark import Transformer, Tree, Token
from undate.undate import Undate, UndateInterval


class EDTFTransformer(Transformer):
    """Transform an EDTF parse tree and return Undate or UndateInterval.

    Each method is a callback named after a rule in the EDTF grammar and
    is called with the list of already-transformed children of that rule.
    Date parts are passed between callbacks as small ``Tree`` objects whose
    ``data`` is ``"year"``, ``"month"`` or ``"day"`` and whose single child
    is the value; :meth:`date` collects those into an ``Undate``.
    """

    # terminal callback: convert INT tokens to Python integers
    INT = int

    def timeinterval(self, items):
        """Combine the transformed children into an ``UndateInterval``."""
        # transformed result from parser should be two undate objects;
        # combine into an interval
        return UndateInterval(*items)

    def date(self, items):
        """Build an ``Undate`` from year / month / day subtrees.

        Children whose ``data`` is not one of ``year``, ``month`` or
        ``day`` are ignored.
        """
        parts = {}
        for child in items:
            if child.data in ["year", "month", "day"]:
                # in each case we expect one value;
                # anonymous tokens convert to their value
                value = child.children[0]
                # convert to integer when possible; otherwise pass as string
                # (e.g. a partially unspecified value such as "XX")
                try:
                    value = int(value)
                except ValueError:
                    value = str(value)
                parts[str(child.data)] = value

        return Undate(**parts)

    def extended_interval(self, items):
        """Combine children into an interval; either end may be ``None``."""
        # same as level 1 time interval, except one item may be None
        # for an open-ended range
        return self.timeinterval(items)

    def unknown_date(self, token):
        """Return ``None`` to represent an unknown interval endpoint."""
        # unknown date for interval should be passed in as None
        return None

    def get_values(self, items):
        """Return the string values of all tokens in ``items``.

        Recurses into subtrees, so tokens nested at any depth are
        returned in order as a flat list of strings.
        """
        values = []
        for i in items:
            if isinstance(i, Token):
                values.append(str(i))
            if isinstance(i, Tree):
                values.extend(self.get_values(i.children))
        return values

    def year_unspecified(self, items):
        """Return a ``year`` tree holding the digits and X's as one string."""
        # combine parts (numeric & unknown) into a single string
        value = "".join(self.get_values(items))
        return Tree(data="year", children=[value])

    def month_unspecified(self, items):
        """Return a ``month`` tree holding the digits and X's as one string."""
        value = "".join(self.get_values(items))
        return Tree(data="month", children=[value])

    def day_unspecified(self, items):
        """Return a ``day`` tree holding the digits and X's as one string."""
        value = "".join(self.get_values(items))
        return Tree(data="day", children=[value])

    def date_level1(self, items):
        """Handle a level 1 date the same way as a level 0 date."""
        return self.date(items)

    def year_fivedigitsplus(self, token):
        """Return a ``year`` tree for a ``Y``-prefixed year of 5+ digits.

        ``token`` is the list of children for the rule: a single token
        such as ``Y170000002``.
        """
        # strip off the leading Y and convert to integer
        # TODO: undate is currently limited to 4-digit years
        # (datetime max year of 9999)
        year = int(str(token[0])[1:])
        # return in the same shape as the other year rules so that
        # the date callback can pick it up
        return Tree(data="year", children=[year])
46 changes: 46 additions & 0 deletions tests/test_dateformat/edtf/test_edtf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pytest

from undate.dateformat.edtf.parser import edtf_parser

# for now, just test that valid dates can be parsed

testcases = [
"1984",
"1984-05",
"1984-12",
"1001-03-30",
"1000/2000",
"1000-01/2000-05-01",
# level 1
"Y170000002",
"2001-21", # spring 2001
# qualifiers
"1984?",
"2004-06~",
"2004-06-11%",
# unspecified digits from right
"201X",
"20XX",
"2004-XX",
"1985-04-XX",
"1985-XX-XX",
# open ended intervals
"1985-04-12/..",
"1985-04/..",
"../1985-04-12",
"/1985-04-12",
]


@pytest.mark.parametrize("date_string", testcases)
def test_should_parse(date_string):
    """Every supported EDTF string should yield a truthy parse result."""
    parsed = edtf_parser.parse(date_string)
    assert parsed


error_cases = ["1984-13", "Y1702"]


@pytest.mark.parametrize("date_string", error_cases)
def test_should_error(date_string):
    """Invalid EDTF strings should make the parser raise."""
    expect_failure = pytest.raises(Exception)
    with expect_failure:
        edtf_parser.parse(date_string)
44 changes: 44 additions & 0 deletions tests/test_dateformat/edtf/test_edtf_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pytest

from undate.undate import Undate, UndateInterval
from undate.dateformat.edtf.parser import edtf_parser
from undate.dateformat.edtf.transformer import EDTFTransformer

# for now, just test that valid dates can be parsed

testcases = [
("1984", Undate(1984)),
("1984-05", Undate(1984, 5)),
("1984-12", Undate(1984, 12)),
("1001-03-30", Undate(1001, 3, 30)),
("1000/2000", UndateInterval(Undate(1000), Undate(2000))),
("1000-01/2000-05-01", UndateInterval(Undate(1000, 1), Undate(2000, 5, 1))),
# # level 1
# NOTE: undate currently doesn't support most of the level 1 functionality
# NOTE: undate currently doesn't support years beyond 9999 (datetime.MAXYEAR)
# ("Y17000002", Undate(17000002)),
# "2001-21", # spring 2001
# # qualifiers
# "1984?",
# "2004-06~",
# "2004-06-11%",
# # unspecified digits from right
("201X", Undate("201X")),
("20XX", Undate("20XX")),
("2004-XX", Undate(2004, "XX")),
("1985-04-XX", Undate(1985, 4, "XX")),
("1985-XX-XX", Undate(1985, "XX", "XX")),
# # open ended intervals
("1985-04-12/..", UndateInterval(Undate(1985, 4, 12), None)),
("1985-04/..", UndateInterval(Undate(1985, 4), None)),
("../1985-04-12", UndateInterval(None, Undate(1985, 4, 12))),
("/1985-04-12", UndateInterval(None, Undate(1985, 4, 12))),
]


@pytest.mark.parametrize("date_string,expected", testcases)
def test_transform(date_string, expected):
    """Parsing then transforming an EDTF string gives the expected object."""
    transformer = EDTFTransformer()
    # first build the parse tree, then convert it to an undate object
    result = transformer.transform(edtf_parser.parse(date_string))
    assert result == expected
7 changes: 5 additions & 2 deletions tests/test_dateformat/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@ def test_parse_to_string(self):
BaseDateFormat().to_string(1991)


@pytest.mark.first
def test_import_formatters_import_only_once(caplog):
# run first so we can confirm it runs once
# clear the cache, since any instantiation of an Undate
# object anywhere in the test suite will populate it
BaseDateFormat.import_formatters.cache_clear()

# run first, and confirm it runs and loads formatters
with caplog.at_level(logging.DEBUG):
import_count = BaseDateFormat.import_formatters()
# should import at least one thing (iso8601)
Expand Down
Loading