From 0a2b6b088deabdb21e5f24a1bfd48f6ef8000580 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Thu, 2 Nov 2023 15:06:17 +0100
Subject: [PATCH 1/3] simplify code

---
 htmldate/core.py    | 150 +++++++++++++++++---------------------------
 tests/unit_tests.py |  38 +++--------
 2 files changed, 66 insertions(+), 122 deletions(-)

diff --git a/htmldate/core.py b/htmldate/core.py
index 3f88cf28..60318f19 100644
--- a/htmldate/core.py
+++ b/htmldate/core.py
@@ -193,6 +193,11 @@ def logstring(element: HtmlElement) -> str:
 
 NON_DIGITS_REGEX = re.compile(r"\D+$")
 
+THREE_COMP_PATTERNS = (
+    (THREE_PATTERN, THREE_CATCH),
+    (THREE_LOOSE_PATTERN, THREE_LOOSE_CATCH),
+)
+
 
 def examine_text(
     text: str,
@@ -407,16 +412,18 @@ def search_pattern(
     pattern: Pattern[str],
     catch: Pattern[str],
     yearpat: Pattern[str],
-    original_date: bool,
-    min_date: datetime,
-    max_date: datetime,
+    options: Extractor,
 ) -> Optional[Match[str]]:
     """Chained candidate filtering and selection"""
     candidates = plausible_year_filter(
-        htmlstring, pattern=pattern, yearpat=yearpat, earliest=min_date, latest=max_date
+        htmlstring,
+        pattern=pattern,
+        yearpat=yearpat,
+        earliest=options.min,
+        latest=options.max,
     )
     return select_candidate(
-        candidates, catch, yearpat, original_date, min_date, max_date
+        candidates, catch, yearpat, options.original, options.min, options.max
     )
 
 
@@ -483,18 +490,13 @@ def examine_abbr_elements(
                 elif elem.text and len(elem.text) > 10:
                     LOGGER.debug("abbr published found: %s", elem.text)
                     reference = compare_reference(reference, elem.text, options)
-        # convert and return
         converted = check_extracted_reference(reference, options)
-        if converted is not None:
-            return converted
-        # try rescue in abbr content
-        dateresult = examine_date_elements(
+        # return or try rescue in abbr content
+        return converted or examine_date_elements(
             tree,
             ".//abbr",
             options,
         )
-        if dateresult is not None:
-            return dateresult
     return None
 
 
@@ -562,9 +564,7 @@ def examine_time_elements(
                 reference = compare_reference(reference, elem.text, options)
             # else...?
         # return
-        converted = check_extracted_reference(reference, options)
-        if converted is not None:
-            return converted
+        return check_extracted_reference(reference, options)
     return None
 
 
@@ -600,9 +600,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         COPYRIGHT_PATTERN,
         YEAR_PATTERN,
         YEAR_PATTERN,
-        options.original,
-        options.min,
-        options.max,
+        options,
     )
     if bestmatch is not None:
         LOGGER.debug("Copyright detected: %s", bestmatch[0])
@@ -614,48 +612,26 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
     # 3 components
     LOGGER.debug("3 components")
     # target URL characteristics
-    bestmatch = search_pattern(
-        htmlstring,
-        THREE_PATTERN,
-        THREE_CATCH,
-        YEAR_PATTERN,
-        options.original,
-        options.min,
-        options.max,
-    )
-    result = filter_ymd_candidate(
-        bestmatch,
-        THREE_PATTERN,
-        options.original,
-        copyear,
-        options.format,
-        options.min,
-        options.max,
-    )
-    if result is not None:
-        return result
-
-    # more loosely structured data
-    bestmatch = search_pattern(
-        htmlstring,
-        THREE_LOOSE_PATTERN,
-        THREE_LOOSE_CATCH,
-        YEAR_PATTERN,
-        options.original,
-        options.min,
-        options.max,
-    )
-    result = filter_ymd_candidate(
-        bestmatch,
-        THREE_LOOSE_PATTERN,
-        options.original,
-        copyear,
-        options.format,
-        options.min,
-        options.max,
-    )
-    if result is not None:
-        return result
+    # then more loosely structured data
+    for patterns in THREE_COMP_PATTERNS:
+        bestmatch = search_pattern(
+            htmlstring,
+            patterns[0],
+            patterns[1],
+            YEAR_PATTERN,
+            options,
+        )
+        result = filter_ymd_candidate(
+            bestmatch,
+            patterns[0],
+            options.original,
+            copyear,
+            options.format,
+            options.min,
+            options.max,
+        )
+        if result is not None:
+            return result
 
     # YYYY-MM-DD/DD-MM-YYYY
     candidates = plausible_year_filter(
@@ -694,9 +670,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         DATESTRINGS_PATTERN,
         DATESTRINGS_CATCH,
         YEAR_PATTERN,
-        options.original,
-        options.min,
-        options.max,
+        options,
     )
     result = filter_ymd_candidate(
         bestmatch,
@@ -749,9 +723,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         YYYYMM_PATTERN,
         YYYYMM_CATCH,
         YEAR_PATTERN,
-        options.original,
-        options.min,
-        options.max,
+        options,
     )
     if bestmatch is not None:
         dateobject = datetime(int(bestmatch[1]), int(bestmatch[2]), 1)
@@ -827,9 +799,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         SIMPLE_PATTERN,
         YEAR_PATTERN,
         YEAR_PATTERN,
-        options.original,
-        options.min,
-        options.max,
+        options,
     )
     if bestmatch is not None:
         dateobject = datetime(int(bestmatch[0]), 1, 1)
@@ -967,27 +937,22 @@ def find_date(
         date_expr = FAST_PREPEND + DATE_EXPRESSIONS
 
     # then look for expressions
-    dateresult = examine_date_elements(
-        search_tree,
-        date_expr,
-        options,
-    )
-    if dateresult is not None:
-        return dateresult
-
-    # look for expressions
-    dateresult = examine_date_elements(
-        search_tree,
-        ".//title|.//h1",
-        options,
+    # then in title/h1 elements, and finally in time elements
+    result = (
+        examine_date_elements(
+            search_tree,
+            date_expr,
+            options,
+        )
+        or examine_date_elements(
+            search_tree,
+            ".//title|.//h1",
+            options,
+        )
+        or examine_time_elements(search_tree, options)
     )
-    if dateresult is not None:
-        return dateresult
-
-    # try time elements
-    time_result = examine_time_elements(search_tree, options)
-    if time_result is not None:
-        return time_result
+    if result is not None:
+        return result
 
     # TODO: decide on this
     # search in discarded parts (e.g. archive.org-banner)
@@ -1023,11 +988,8 @@ def find_date(
             if not MIN_SEGMENT_LEN < len(segment) < MAX_SEGMENT_LEN:
                 continue
             reference = compare_reference(reference, segment, options)
-        # return
         converted = check_extracted_reference(reference, options)
-        if converted is not None:
-            return converted
-        # search page HTML
-        return search_page(htmlstring, options)
+        # return or search page HTML
+        return converted or search_page(htmlstring, options)
 
     return None
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 104d808a..e3233f55 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1207,11 +1207,9 @@ def test_approximate_url():
     )
 
 
-def test_search_pattern(
-    original_date=False, min_date=MIN_DATE, max_date=LATEST_POSSIBLE
-):
+def test_search_pattern():
     """test pattern search in strings"""
-    #
+    options = Extractor(True, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT)
     pattern = re.compile(r"\D([0-9]{4}[/.-][0-9]{2})\D")
     catch = re.compile(r"([0-9]{4})[/.-]([0-9]{2})")
     yearpat = re.compile(r"^([12][0-9]{3})")
@@ -1221,9 +1219,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is None
     )
@@ -1233,9 +1229,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is not None
     )
@@ -1245,9 +1239,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is None
     )
@@ -1257,9 +1249,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is not None
     )
@@ -1273,9 +1263,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is None
     )
@@ -1285,9 +1273,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is not None
     )
@@ -1301,9 +1287,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is None
     )
@@ -1313,9 +1297,7 @@ def test_search_pattern(
             pattern,
             catch,
             yearpat,
-            original_date,
-            min_date,
-            max_date,
+            options,
         )
         is not None
     )

From dbf474c71e20dc188e11e6771f39b04b509ee64b Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Fri, 3 Nov 2023 13:52:04 +0100
Subject: [PATCH 2/3] fix coverage

---
 htmldate/core.py    | 10 ++++++----
 tests/unit_tests.py |  7 +++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/htmldate/core.py b/htmldate/core.py
index 60318f19..eb118083 100644
--- a/htmldate/core.py
+++ b/htmldate/core.py
@@ -447,6 +447,7 @@ def examine_abbr_elements(
     options: Extractor,
 ) -> Optional[str]:
     """Scan the page for abbr elements and check if their content contains an eligible date"""
+    result = None
     elements = tree.findall(".//abbr")
     if elements is not None and len(elements) < MAX_POSSIBLE_CANDIDATES:
         reference = 0
@@ -492,12 +493,12 @@ def examine_abbr_elements(
                     reference = compare_reference(reference, elem.text, options)
         converted = check_extracted_reference(reference, options)
         # return or try rescue in abbr content
-        return converted or examine_date_elements(
+        result = converted or examine_date_elements(
             tree,
             ".//abbr",
             options,
         )
-    return None
+    return result
 
 
 def examine_time_elements(
@@ -505,6 +506,7 @@ def examine_time_elements(
     options: Extractor,
 ) -> Optional[str]:
     """Scan the page for time elements and check if their content contains an eligible date"""
+    result = None
     elements = tree.findall(".//time")
     if elements is not None and len(elements) < MAX_POSSIBLE_CANDIDATES:
         # scan all the tags and look for the newest one
@@ -564,8 +566,8 @@ def examine_time_elements(
                 reference = compare_reference(reference, elem.text, options)
             # else...?
         # return
-        return check_extracted_reference(reference, options)
-    return None
+        result = check_extracted_reference(reference, options)
+    return result
 
 
 def normalize_match(match: Optional[Match[str]]) -> str:
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index e3233f55..497fc345 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -175,6 +175,13 @@ def test_no_date():
         )
         is None
     )
+    assert find_date("<html><body><time></time></body></html>") is None
+    assert (
+        find_date(
+            '<html><body><abbr class="published"></abbr></body></html>',
+        )
+        is None
+    )
 
 
 def test_exact_date():

From 0a4128ee2b4661586eb037f6974643d6dc72f5f1 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Fri, 3 Nov 2023 14:12:12 +0100
Subject: [PATCH 3/3] simplify further

---
 htmldate/core.py    | 24 +++++++-----------------
 tests/unit_tests.py | 34 ++++++++++++----------------------
 2 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/htmldate/core.py b/htmldate/core.py
index eb118083..505bbe85 100644
--- a/htmldate/core.py
+++ b/htmldate/core.py
@@ -355,9 +355,7 @@ def select_candidate(
     occurrences: Counter_Type[str],
     catch: Pattern[str],
     yearpat: Pattern[str],
-    original_date: bool,
-    min_date: datetime,
-    max_date: datetime,
+    options: Extractor,
 ) -> Optional[Match[str]]:
     """Select a candidate among the most frequent matches"""
     if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES:
@@ -372,7 +370,7 @@ def select_candidate(
     firstselect = occurrences.most_common(10)
     LOGGER.debug("firstselect: %s", firstselect)
     # sort and find probable candidates
-    bestones = sorted(firstselect, reverse=not original_date)[:2]
+    bestones = sorted(firstselect, reverse=not options.original)[:2]
     LOGGER.debug("bestones: %s", bestones)
 
     # plausibility heuristics
@@ -385,7 +383,7 @@ def select_candidate(
             years[i] = year_match[1]
             dateobject = datetime(int(year_match[1]), 1, 1)
             validation[i] = is_valid_date(
-                dateobject, "%Y", earliest=min_date, latest=max_date
+                dateobject, "%Y", earliest=options.min, latest=options.max
             )
 
     # safety net: plausibility
@@ -422,9 +420,7 @@ def search_pattern(
         earliest=options.min,
         latest=options.max,
     )
-    return select_candidate(
-        candidates, catch, yearpat, options.original, options.min, options.max
-    )
+    return select_candidate(candidates, catch, yearpat, options)
 
 
 @lru_cache(maxsize=CACHE_SIZE)
@@ -651,9 +647,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         replacement[candidate] = candidates[item]
     candidates = Counter(replacement)
     # select
-    bestmatch = select_candidate(
-        candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max
-    )
+    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
     result = filter_ymd_candidate(
         bestmatch,
         SELECT_YMD_PATTERN,
@@ -702,9 +696,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         candidate = normalize_match(match)
         replacement[candidate] = candidates[item]
     candidates = Counter(replacement)
-    bestmatch = select_candidate(
-        candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max
-    )
+    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
     result = filter_ymd_candidate(
         bestmatch,
         SLASHES_PATTERN,
@@ -760,9 +752,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         replacement[candidate] = candidates[item]
     candidates = Counter(replacement)
     # select
-    bestmatch = select_candidate(
-        candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max
-    )
+    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
     result = filter_ymd_candidate(
         bestmatch,
         MMYYYY_PATTERN,
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 497fc345..9e04b0fa 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -860,9 +860,9 @@ def test_compare_reference():
     assert compare_reference(1517500000, "2018-02-01", options) == 1517500000
 
 
-def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE):
+def test_candidate_selection():
     """test the algorithm for several candidates"""
-    original_date = False
+    options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT)
     # patterns
     catch = re.compile(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")
     yearpat = re.compile(r"^([0-9]{4})")
@@ -878,9 +878,7 @@ def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE):
             "2-28",
         ]
     )
-    result = select_candidate(
-        occurrences, catch, yearpat, original_date, min_date, max_date
-    )
+    result = select_candidate(occurrences, catch, yearpat, options)
     assert result is None
     # plausible
     occurrences = Counter(
@@ -894,38 +892,30 @@ def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE):
             "2017-11-28",
         ]
     )
-    result = select_candidate(
-        occurrences, catch, yearpat, original_date, min_date, max_date
-    )
+    result = select_candidate(occurrences, catch, yearpat, options)
     assert result.group(0) == "2017-11-28"
-    original_date = True
-    result = select_candidate(
-        occurrences, catch, yearpat, original_date, min_date, max_date
-    )
+
+    options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, True, OUTPUTFORMAT)
+    result = select_candidate(occurrences, catch, yearpat, options)
     assert result.group(0) == "2016-07-12"
     # mix plausible/implausible
     occurrences = Counter(
         ["2116-12-23", "2116-12-23", "2116-12-23", "2017-08-11", "2017-08-11"]
     )
-    result = select_candidate(
-        occurrences, catch, yearpat, original_date, min_date, max_date
-    )
+    result = select_candidate(occurrences, catch, yearpat, options)
     assert result.group(0) == "2017-08-11"
-    original_date = False
+
+    options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT)
     occurrences = Counter(
         ["2116-12-23", "2116-12-23", "2116-12-23", "2017-08-11", "2017-08-11"]
     )
-    result = select_candidate(
-        occurrences, catch, yearpat, original_date, min_date, max_date
-    )
+    result = select_candidate(occurrences, catch, yearpat, options)
     assert result.group(0) == "2017-08-11"
     # taking date present twice, corner case
     occurrences = Counter(
         ["2016-12-23", "2016-12-23", "2017-08-11", "2017-08-11", "2017-08-11"]
     )
-    result = select_candidate(
-        occurrences, catch, yearpat, original_date, min_date, max_date
-    )
+    result = select_candidate(occurrences, catch, yearpat, options)
     assert result.group(0) == "2016-12-23"