From 0a2b6b088deabdb21e5f24a1bfd48f6ef8000580 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 2 Nov 2023 15:06:17 +0100 Subject: [PATCH 1/3] simplify code --- htmldate/core.py | 150 +++++++++++++++++--------------------------- tests/unit_tests.py | 38 +++-------- 2 files changed, 66 insertions(+), 122 deletions(-) diff --git a/htmldate/core.py b/htmldate/core.py index 3f88cf28..60318f19 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -193,6 +193,11 @@ def logstring(element: HtmlElement) -> str: NON_DIGITS_REGEX = re.compile(r"\D+$") +THREE_COMP_PATTERNS = ( + (THREE_PATTERN, THREE_CATCH), + (THREE_LOOSE_PATTERN, THREE_LOOSE_CATCH), +) + def examine_text( text: str, @@ -407,16 +412,18 @@ def search_pattern( pattern: Pattern[str], catch: Pattern[str], yearpat: Pattern[str], - original_date: bool, - min_date: datetime, - max_date: datetime, + options: Extractor, ) -> Optional[Match[str]]: """Chained candidate filtering and selection""" candidates = plausible_year_filter( - htmlstring, pattern=pattern, yearpat=yearpat, earliest=min_date, latest=max_date + htmlstring, + pattern=pattern, + yearpat=yearpat, + earliest=options.min, + latest=options.max, ) return select_candidate( - candidates, catch, yearpat, original_date, min_date, max_date + candidates, catch, yearpat, options.original, options.min, options.max ) @@ -483,18 +490,13 @@ def examine_abbr_elements( elif elem.text and len(elem.text) > 10: LOGGER.debug("abbr published found: %s", elem.text) reference = compare_reference(reference, elem.text, options) - # convert and return converted = check_extracted_reference(reference, options) - if converted is not None: - return converted - # try rescue in abbr content - dateresult = examine_date_elements( + # return or try rescue in abbr content + return converted or examine_date_elements( tree, ".//abbr", options, ) - if dateresult is not None: - return dateresult return None @@ -562,9 +564,7 @@ def examine_time_elements( reference = compare_reference(reference, elem.text, options) # else...? # return - converted = check_extracted_reference(reference, options) - if converted is not None: - return converted + return check_extracted_reference(reference, options) return None @@ -600,9 +600,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: COPYRIGHT_PATTERN, YEAR_PATTERN, YEAR_PATTERN, - options.original, - options.min, - options.max, + options, ) if bestmatch is not None: LOGGER.debug("Copyright detected: %s", bestmatch[0]) @@ -614,48 +612,26 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: # 3 components LOGGER.debug("3 components") # target URL characteristics - bestmatch = search_pattern( - htmlstring, - THREE_PATTERN, - THREE_CATCH, - YEAR_PATTERN, - options.original, - options.min, - options.max, - ) - result = filter_ymd_candidate( - bestmatch, - THREE_PATTERN, - options.original, - copyear, - options.format, - options.min, - options.max, - ) - if result is not None: - return result - - # more loosely structured data - bestmatch = search_pattern( - htmlstring, - THREE_LOOSE_PATTERN, - THREE_LOOSE_CATCH, - YEAR_PATTERN, - options.original, - options.min, - options.max, - ) - result = filter_ymd_candidate( - bestmatch, - THREE_LOOSE_PATTERN, - options.original, - copyear, - options.format, - options.min, - options.max, - ) - if result is not None: - return result + # then more loosely structured data + for patterns in THREE_COMP_PATTERNS: + bestmatch = search_pattern( + htmlstring, + patterns[0], + patterns[1], + YEAR_PATTERN, + options, + ) + result = filter_ymd_candidate( + bestmatch, + patterns[0], + options.original, + copyear, + options.format, + options.min, + options.max, + ) + if result is not None: + return result # YYYY-MM-DD/DD-MM-YYYY candidates = plausible_year_filter( @@ -694,9 +670,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: DATESTRINGS_PATTERN, DATESTRINGS_CATCH, YEAR_PATTERN, - options.original, - options.min, - options.max, + options, ) result = filter_ymd_candidate( bestmatch, @@ -749,9 +723,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: YYYYMM_PATTERN, YYYYMM_CATCH, YEAR_PATTERN, - options.original, - options.min, - options.max, + options, ) if bestmatch is not None: dateobject = datetime(int(bestmatch[1]), int(bestmatch[2]), 1) @@ -827,9 +799,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: SIMPLE_PATTERN, YEAR_PATTERN, YEAR_PATTERN, - options.original, - options.min, - options.max, + options, ) if bestmatch is not None: dateobject = datetime(int(bestmatch[0]), 1, 1) @@ -967,27 +937,22 @@ def find_date( date_expr = FAST_PREPEND + DATE_EXPRESSIONS # then look for expressions - dateresult = examine_date_elements( - search_tree, - date_expr, - options, - ) - if dateresult is not None: - return dateresult - - # look for expressions - dateresult = examine_date_elements( - search_tree, - ".//title|.//h1", - options, + # and try time elements + result = ( + examine_date_elements( + search_tree, + date_expr, + options, + ) + or examine_date_elements( + search_tree, + ".//title|.//h1", + options, + ) + or examine_time_elements(search_tree, options) ) - if dateresult is not None: - return dateresult - - # try time elements - time_result = examine_time_elements(search_tree, options) - if time_result is not None: - return time_result + if result is not None: + return result # TODO: decide on this # search in discarded parts (e.g. archive.org-banner) @@ -1023,11 +988,8 @@ def find_date( if not MIN_SEGMENT_LEN < len(segment) < MAX_SEGMENT_LEN: continue reference = compare_reference(reference, segment, options) - # return converted = check_extracted_reference(reference, options) - if converted is not None: - return converted - # search page HTML - return search_page(htmlstring, options) + # return or search page HTML + return converted or search_page(htmlstring, options) return None diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 104d808a..e3233f55 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1207,11 +1207,9 @@ def test_approximate_url(): ) -def test_search_pattern( - original_date=False, min_date=MIN_DATE, max_date=LATEST_POSSIBLE -): +def test_search_pattern(): """test pattern search in strings""" - # + options = Extractor(True, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT) pattern = re.compile(r"\D([0-9]{4}[/.-][0-9]{2})\D") catch = re.compile(r"([0-9]{4})[/.-]([0-9]{2})") yearpat = re.compile(r"^([12][0-9]{3})") @@ -1221,9 +1219,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is None ) @@ -1233,9 +1229,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is not None ) @@ -1245,9 +1239,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is None ) @@ -1257,9 +1249,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is not None ) @@ -1273,9 +1263,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is None ) @@ -1285,9 +1273,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is not None ) @@ -1301,9 +1287,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is None ) @@ -1313,9 +1297,7 @@ def test_search_pattern( pattern, catch, yearpat, - original_date, - min_date, - max_date, + options, ) is not None ) From dbf474c71e20dc188e11e6771f39b04b509ee64b Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 3 Nov 2023 13:52:04 +0100 Subject: [PATCH 2/3] fix coverage --- htmldate/core.py | 10 ++++++---- tests/unit_tests.py | 7 +++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/htmldate/core.py b/htmldate/core.py index 60318f19..eb118083 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -447,6 +447,7 @@ def examine_abbr_elements( options: Extractor, ) -> Optional[str]: """Scan the page for abbr elements and check if their content contains an eligible date""" + result = None elements = tree.findall(".//abbr") if elements is not None and len(elements) < MAX_POSSIBLE_CANDIDATES: reference = 0 @@ -492,12 +493,12 @@ def examine_abbr_elements( reference = compare_reference(reference, elem.text, options) converted = check_extracted_reference(reference, options) # return or try rescue in abbr content - return converted or examine_date_elements( + result = converted or examine_date_elements( tree, ".//abbr", options, ) - return None + return result def examine_time_elements( @@ -505,6 +506,7 @@ def examine_time_elements( options: Extractor, ) -> Optional[str]: """Scan the page for time elements and check if their content contains an eligible date""" + result = None elements = tree.findall(".//time") if elements is not None and len(elements) < MAX_POSSIBLE_CANDIDATES: # scan all the tags and look for the newest one @@ -564,8 +566,8 @@ def examine_time_elements( reference = compare_reference(reference, elem.text, options) # else...? # return - return check_extracted_reference(reference, options) - return None + result = check_extracted_reference(reference, options) + return result def normalize_match(match: Optional[Match[str]]) -> str: diff --git a/tests/unit_tests.py b/tests/unit_tests.py index e3233f55..497fc345 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -175,6 +175,13 @@ def test_no_date(): ) is None ) + assert find_date("") is None + assert ( + find_date( + '', + ) + is None + ) def test_exact_date(): From 0a4128ee2b4661586eb037f6974643d6dc72f5f1 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 3 Nov 2023 14:12:12 +0100 Subject: [PATCH 3/3] simplify further --- htmldate/core.py | 24 +++++++----------------- tests/unit_tests.py | 34 ++++++++++++---------------------- 2 files changed, 19 insertions(+), 39 deletions(-) diff --git a/htmldate/core.py b/htmldate/core.py index eb118083..505bbe85 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -355,9 +355,7 @@ def select_candidate( occurrences: Counter_Type[str], catch: Pattern[str], yearpat: Pattern[str], - original_date: bool, - min_date: datetime, - max_date: datetime, + options: Extractor, ) -> Optional[Match[str]]: """Select a candidate among the most frequent matches""" if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES: @@ -372,7 +370,7 @@ def select_candidate( firstselect = occurrences.most_common(10) LOGGER.debug("firstselect: %s", firstselect) # sort and find probable candidates - bestones = sorted(firstselect, reverse=not original_date)[:2] + bestones = sorted(firstselect, reverse=not options.original)[:2] LOGGER.debug("bestones: %s", bestones) # plausibility heuristics @@ -385,7 +383,7 @@ def select_candidate( years[i] = year_match[1] dateobject = datetime(int(year_match[1]), 1, 1) validation[i] = is_valid_date( - dateobject, "%Y", earliest=min_date, latest=max_date + dateobject, "%Y", earliest=options.min, latest=options.max ) # safety net: plausibility @@ -422,9 +420,7 @@ def search_pattern( earliest=options.min, latest=options.max, ) - return select_candidate( - candidates, catch, yearpat, options.original, options.min, options.max - ) + return select_candidate(candidates, catch, yearpat, options) @lru_cache(maxsize=CACHE_SIZE) @@ -651,9 +647,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: replacement[candidate] = candidates[item] candidates = Counter(replacement) # select - bestmatch = select_candidate( - candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max - ) + bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) result = filter_ymd_candidate( bestmatch, SELECT_YMD_PATTERN, @@ -702,9 +696,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: candidate = normalize_match(match) replacement[candidate] = candidates[item] candidates = Counter(replacement) - bestmatch = select_candidate( - candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max - ) + bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) result = filter_ymd_candidate( bestmatch, SLASHES_PATTERN, @@ -760,9 +752,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: replacement[candidate] = candidates[item] candidates = Counter(replacement) # select - bestmatch = select_candidate( - candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max - ) + bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) result = filter_ymd_candidate( bestmatch, MMYYYY_PATTERN, diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 497fc345..9e04b0fa 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -860,9 +860,9 @@ def test_compare_reference(): assert compare_reference(1517500000, "2018-02-01", options) == 1517500000 -def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE): +def test_candidate_selection(): """test the algorithm for several candidates""" - original_date = False + options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT) # patterns catch = re.compile(r"([0-9]{4})-([0-9]{2})-([0-9]{2})") yearpat = re.compile(r"^([0-9]{4})") @@ -878,9 +878,7 @@ def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE): "2-28", ] ) - result = select_candidate( - occurrences, catch, yearpat, original_date, min_date, max_date - ) + result = select_candidate(occurrences, catch, yearpat, options) assert result is None # plausible occurrences = Counter( @@ -894,38 +892,30 @@ def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE): "2017-11-28", ] ) - result = select_candidate( - occurrences, catch, yearpat, original_date, min_date, max_date - ) + result = select_candidate(occurrences, catch, yearpat, options) assert result.group(0) == "2017-11-28" - original_date = True - result = select_candidate( - occurrences, catch, yearpat, original_date, min_date, max_date - ) + + options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, True, OUTPUTFORMAT) + result = select_candidate(occurrences, catch, yearpat, options) assert result.group(0) == "2016-07-12" # mix plausible/implausible occurrences = Counter( ["2116-12-23", "2116-12-23", "2116-12-23", "2017-08-11", "2017-08-11"] ) - result = select_candidate( - occurrences, catch, yearpat, original_date, min_date, max_date - ) + result = select_candidate(occurrences, catch, yearpat, options) assert result.group(0) == "2017-08-11" - original_date = False + + options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT) occurrences = Counter( ["2116-12-23", "2116-12-23", "2116-12-23", "2017-08-11", "2017-08-11"] ) - result = select_candidate( - occurrences, catch, yearpat, original_date, min_date, max_date - ) + result = select_candidate(occurrences, catch, yearpat, options) assert result.group(0) == "2017-08-11" # taking date present twice, corner case occurrences = Counter( ["2016-12-23", "2016-12-23", "2017-08-11", "2017-08-11", "2017-08-11"] ) - result = select_candidate( - occurrences, catch, yearpat, original_date, min_date, max_date - ) + result = select_candidate(occurrences, catch, yearpat, options) assert result.group(0) == "2016-12-23"