Skip to content

Commit

Permalink
add further test case, make code stricter and simpler
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Oct 12, 2023
1 parent b24864f commit 61b634f
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 20 deletions.
28 changes: 10 additions & 18 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from copy import deepcopy
from datetime import datetime
from functools import lru_cache, partial
from typing import Match, Optional, Pattern, Tuple, Union, Counter as Counter_Type
from typing import Match, Optional, Pattern, Union, Counter as Counter_Type

from lxml.html import HtmlElement, tostring # type: ignore

Expand Down Expand Up @@ -637,15 +637,13 @@ def examine_time_elements(
return None


def normalize_match(match: Optional[Match[str]]) -> Tuple[str, str]:
    """Return the day and month of a date match, each padded to two digits.

    Group 1 of ``match`` is read as the day and group 2 as the month.
    A one-character value gets a leading "0"; longer values are returned
    unchanged. Any further group (such as the year) is left to the caller.

    The argument is annotated as optional, but it is subscripted without a
    check: passing ``None`` raises ``TypeError``.
    """
    day, month = match[1], match[2]  # type: ignore[index]
    if len(day) == 1:
        day = "0" + day
    if len(month) == 1:
        month = "0" + month
    return day, month
def normalize_match(match: Optional[Match[str]]) -> str:
    """Turn a day/month/year regex match into a ``YYYY-MM-DD`` string.

    The non-empty groups of ``match`` are read, in order, as day, month and
    year. Day and month are left-padded with "0" to two digits. A two-digit
    year is expanded to four digits: "19" is prepended if it starts with
    "9", "20" otherwise.

    The argument is annotated as optional, but ``None`` is not handled and
    raises ``AttributeError``. A match that does not yield exactly three
    non-empty groups raises ``ValueError`` on unpacking.
    """
    # Patterns built as an alternation leave the groups of the branch that
    # did not participate set to None, hence the filter before unpacking.
    day, month, year = (
        part.zfill(2) for part in match.groups() if part  # type: ignore[union-attr]
    )
    if len(year) == 2:
        century = "19" if year[0] == "9" else "20"
        year = century + year
    return f"{year}-{month}-{day}"


def search_page(
Expand Down Expand Up @@ -760,8 +758,7 @@ def search_page(
replacement = {}
for item in candidates:
match = THREE_COMP_REGEX_A.match(item)
day, month = normalize_match(match)
candidate = "-".join([match[3], month, day]) # type: ignore[index]
candidate = normalize_match(match)
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
# select
Expand Down Expand Up @@ -815,12 +812,7 @@ def search_page(
replacement = {}
for item in candidates:
match = THREE_COMP_REGEX_B.match(item)
day, month = normalize_match(match)
if match[3][0] == "9": # type: ignore[index]
year = "19" + match[3] # type: ignore[index]
else:
year = "20" + match[3] # type: ignore[index]
candidate = "-".join([year, month, day])
candidate = normalize_match(match)
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
bestmatch = select_candidate(
Expand Down
8 changes: 6 additions & 2 deletions htmldate/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,9 @@

# core patterns
# One or two digits, a separator ("/", "." or "-"), one or two digits,
# a separator, then a four-digit group.
THREE_COMP_REGEX_A = re.compile(r"([0-3]?[0-9])[/.-]([01]?[0-9])[/.-]([0-9]{4})")
# Earlier, looser two-digit-year pattern; the name is rebound by the
# assignment that follows, so this definition is not the effective one.
THREE_COMP_REGEX_B = re.compile(r"([0-3]?[0-9])[/.-]([01]?[0-9])[/.-]([0-9]{2})")
# Stricter two-digit-year pattern with two alternatives:
#   - slash-separated: the first two components may have one or two digits;
#   - "." or "-" separated: the first two components must have two digits
#     each, so a string like "5.1.98" is not matched.
# The alternation yields six groups; only the three belonging to the
# alternative that matched are set, the others are None.
THREE_COMP_REGEX_B = re.compile(
r"([0-3]?[0-9])/([01]?[0-9])/([0-9]{2})|([0-3][0-9])[.-]([01][0-9])[.-]([0-9]{2})"
)
# One or two digits, a separator, then a four-digit group.
TWO_COMP_REGEX = re.compile(r"([0-3]?[0-9])[/.-]([0-9]{4})")

# extensive search patterns
Expand All @@ -205,7 +207,9 @@
r"(\D19[0-9]{2}[01][0-9][0-3][0-9]\D|\D20[0-9]{2}[01][0-9][0-3][0-9]\D)"
)
# Three capture groups over eight consecutive digits: four digits starting
# with 1 or 2, then two two-digit groups (presumably YYYYMMDD -- confirm
# against the callers).
DATESTRINGS_CATCH = re.compile(r"([12][0-9]{3})([01][0-9])([0-3][0-9])")
# Earlier, looser pattern; the name is rebound by the assignment that
# follows, so this definition is not the effective one.
SLASHES_PATTERN = re.compile(r"\D([0-3]?[0-9][/.][01]?[0-9][/.][0129][0-9])\D")
# Stricter pattern; the candidate must be enclosed by non-digit characters
# and its last component is two digits, the first being 0, 1, 2 or 9:
#   - slash-separated: the first two components may have one or two digits;
#   - dot-separated: the first two components must have two digits each.
SLASHES_PATTERN = re.compile(
r"\D([0-3]?[0-9]/[01]?[0-9]/[0129][0-9]|[0-3][0-9]\.[01][0-9]\.[0129][0-9])\D"
)
# Captures two digits at the very end of the string.
SLASHES_YEAR = re.compile(r"([0-9]{2})$")
# Four digits starting with 1 or 2, a separator ("/", "." or "-") and two
# digits starting with 0 or 1, enclosed by non-digit characters.
YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-][01][0-9])\D")
# Same shape without the surrounding non-digits, with the two components
# captured separately.
YYYYMM_CATCH = re.compile(r"([12][0-9]{3})[/.-]([01][0-9])")
Expand Down
10 changes: 10 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1539,6 +1539,16 @@ def test_search_html(original_date=False, min_date=MIN_DATE, max_date=LATEST_POS
)
is None
)
assert (
search_page(
'<html><body><link href="//homepagedesigner.telekom.de/.cm4all/res/static/beng-editor/5.1.98/css/deploy.css"/></body></html>',
OUTPUTFORMAT,
original_date,
min_date,
max_date,
)
is None
)


def test_idiosyncrasies():
Expand Down

0 comments on commit 61b634f

Please sign in to comment.