add further test case, make code stricter and simpler

adbar · Oct 12, 2023 · 61b634f
1 parent b24864f
commit 61b634f
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 20 deletions.
diff --git a/htmldate/core.py b/htmldate/core.py
@@ -14,7 +14,7 @@
 from copy import deepcopy
 from datetime import datetime
 from functools import lru_cache, partial
-from typing import Match, Optional, Pattern, Tuple, Union, Counter as Counter_Type
+from typing import Match, Optional, Pattern, Union, Counter as Counter_Type
 
 from lxml.html import HtmlElement, tostring  # type: ignore
 
@@ -637,15 +637,13 @@ def examine_time_elements(
     return None
 
 
-def normalize_match(match: Optional[Match[str]]) -> Tuple[str, str]:
-    """Normalize string output by adding "0" if necessary."""
-    day = match[1]  # type: ignore[index]
-    if len(day) == 1:
-        day = "0" + day
-    month = match[2]  # type: ignore[index]
-    if len(month) == 1:
-        month = "0" + month
-    return day, month
+def normalize_match(match: Optional[Match[str]]) -> str:
+    """Build a "YYYY-MM-DD" string from the non-empty day, month and year groups of match,
+    zero-padding day and month; a two-digit year gets "19" if it starts with "9", else "20"."""
+    day, month, year = (g.zfill(2) for g in match.groups() if g)  # type: ignore[union-attr]
+    if len(year) == 2:
+        year = "19" + year if year[0] == "9" else "20" + year
+    return f"{year}-{month}-{day}"
 
 
 def search_page(
@@ -760,8 +758,7 @@ def search_page(
     replacement = {}
     for item in candidates:
         match = THREE_COMP_REGEX_A.match(item)
-        day, month = normalize_match(match)
-        candidate = "-".join([match[3], month, day])  # type: ignore[index]
+        candidate = normalize_match(match)
         replacement[candidate] = candidates[item]
     candidates = Counter(replacement)
     # select
@@ -815,12 +812,7 @@ def search_page(
     replacement = {}
     for item in candidates:
         match = THREE_COMP_REGEX_B.match(item)
-        day, month = normalize_match(match)
-        if match[3][0] == "9":  # type: ignore[index]
-            year = "19" + match[3]  # type: ignore[index]
-        else:
-            year = "20" + match[3]  # type: ignore[index]
-        candidate = "-".join([year, month, day])
+        candidate = normalize_match(match)
         replacement[candidate] = candidates[item]
     candidates = Counter(replacement)
     bestmatch = select_candidate(

diff --git a/htmldate/extractors.py b/htmldate/extractors.py
@@ -186,7 +186,9 @@
 
 # core patterns
 THREE_COMP_REGEX_A = re.compile(r"([0-3]?[0-9])[/.-]([01]?[0-9])[/.-]([0-9]{4})")
-THREE_COMP_REGEX_B = re.compile(r"([0-3]?[0-9])[/.-]([01]?[0-9])[/.-]([0-9]{2})")
+THREE_COMP_REGEX_B = re.compile(  # "/" form allows 1-digit day/month; "." or "-" form needs 2 digits
+    r"([0-3]?[0-9])/([01]?[0-9])/([0-9]{2})|([0-3][0-9])[.-]([01][0-9])[.-]([0-9]{2})"
+)
 TWO_COMP_REGEX = re.compile(r"([0-3]?[0-9])[/.-]([0-9]{4})")
 
 # extensive search patterns
@@ -205,7 +207,9 @@
     r"(\D19[0-9]{2}[01][0-9][0-3][0-9]\D|\D20[0-9]{2}[01][0-9][0-3][0-9]\D)"
 )
 DATESTRINGS_CATCH = re.compile(r"([12][0-9]{3})([01][0-9])([0-3][0-9])")
-SLASHES_PATTERN = re.compile(r"\D([0-3]?[0-9][/.][01]?[0-9][/.][0129][0-9])\D")
+SLASHES_PATTERN = re.compile(  # "/" form allows 1-digit day/month; "." form needs 2 digits
+    r"\D([0-3]?[0-9]/[01]?[0-9]/[0129][0-9]|[0-3][0-9]\.[01][0-9]\.[0129][0-9])\D"
+)
 SLASHES_YEAR = re.compile(r"([0-9]{2})$")
 YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-][01][0-9])\D")
 YYYYMM_CATCH = re.compile(r"([12][0-9]{3})[/.-]([01][0-9])")

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -1539,6 +1539,16 @@ def test_search_html(original_date=False, min_date=MIN_DATE, max_date=LATEST_POS
         )
         is None
     )
+    assert (
+        search_page(
+            '<html><body><link href="//homepagedesigner.telekom.de/.cm4all/res/static/beng-editor/5.1.98/css/deploy.css"/></body></html>',
+            OUTPUTFORMAT,
+            original_date,
+            min_date,
+            max_date,
+        )
+        is None
+    )
 
 
 def test_idiosyncrasies():