From 522db7deec1a6b83ed768d08a43c07f26a51397c Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 15 Nov 2022 16:44:32 +0100 Subject: [PATCH] extensive search: regex_parse on whole document (#67) --- htmldate/core.py | 31 ++++++++++++++++--------------- tests/unit_tests.py | 3 +-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/htmldate/core.py b/htmldate/core.py index c8e13e49..e9b2b29d 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -26,7 +26,7 @@ idiosyncrasies_search, img_search, json_search, - # regex_parse, + regex_parse, timestamp_search, try_date_expr, DATE_EXPRESSIONS, @@ -846,6 +846,20 @@ def search_page( if result is not None: return result + # try full-blown text regex on all HTML? + dateobject = regex_parse(htmlstring) + # todo: find all candidates and disambiguate? + if date_validator( + dateobject, outputformat, earliest=min_date, latest=max_date + ) is True and ( + copyear == 0 or dateobject.year >= copyear # type: ignore[union-attr] + ): + try: + LOGGER.debug("regex result on HTML: %s", dateobject) + return dateobject.strftime(outputformat) # type: ignore + except ValueError as err: + LOGGER.error("value error during conversion: %s %s", dateobject, err) + # catchall: copyright mention if copyear != 0: LOGGER.debug("using copyright year as default") @@ -853,20 +867,7 @@ def search_page( "-".join([str(copyear), "01", "01"]), "%Y-%m-%d", outputformat ) - # try full-blown text regex on all HTML? - # dateobject = regex_parse(htmlstring) - # todo: find all candidates and disambiguate? - # if ( - # date_validator(dateobject, outputformat, earliest=min_date, latest=max_date) - # is True - # ): - # try: - # LOGGER.debug("regex result on HTML: %s", dateobject) - # return dateobject.strftime(outputformat) # type: ignore - # except ValueError as err: - # LOGGER.error("value error during conversion: %s %s", string, err) - - # 1 component, last try + # last resort: 1 component LOGGER.debug("switching to one component") bestmatch = search_pattern( htmlstring, diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 2af4cfbd..4a9c0ee7 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -559,8 +559,7 @@ def test_exact_date(): '', outputformat="%Y-%m-%d %H:%m:%S", ) - == "2020-01-01 00:01:00" # solution so far - # == "2020-11-29 00:11:00" # better + == "2020-11-29 00:11:00" # 00:11 unclear ) assert ( find_date(