From beef5514dda6038f95b65882ae0637393094db4d Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 26 Oct 2023 15:38:59 +0200
Subject: [PATCH] simplify further

core.py:
- examine_text: return early when the trimmed text is not longer than
  MIN_SEGMENT_LEN instead of carrying an `attempt` placeholder.
- examine_date_elements: run examine_text over the element text and the
  "title" attribute in one inner loop instead of two copied call sites.
- find_date: call pattern_search with TIMESTAMP_PATTERN in place of the
  removed timestamp_search.

extractors.py:
- add pattern_search(text, date_pattern, outputformat, min_date, max_date),
  which searches the text with the given compiled pattern, validates
  group 1 as "%Y-%m-%d" against the date bounds and converts it to the
  requested output format.
- remove timestamp_search and route json_search through pattern_search.
- json_search only returns from inside its loop when pattern_search
  yields a date, so a script element without a valid date does not stop
  the scan of the remaining script elements (same behaviour as before the
  refactoring).
---
Reviewer note: the json_search hunk was changed during review (conditional
return instead of an unconditional one); the hunk header and the diffstat
below were adjusted accordingly. The blob ids on the "index" lines were not
regenerated and still refer to the originally submitted revision.

 htmldate/core.py       | 38 ++++++++++++++++++--------------------
 htmldate/extractors.py | 44 ++++++++++++++++++++++++--------------------
 2 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/htmldate/core.py b/htmldate/core.py
index 44145ac3..f7f69cb3 100644
--- a/htmldate/core.py
+++ b/htmldate/core.py
@@ -26,7 +26,7 @@
     img_search,
     json_search,
     regex_parse,
-    timestamp_search,
+    pattern_search,
     try_date_expr,
     DATE_EXPRESSIONS,
     FAST_PREPEND,
@@ -37,6 +37,7 @@
     YEAR_PATTERN,
     YMD_PATTERN,
     COPYRIGHT_PATTERN,
+    TIMESTAMP_PATTERN,
     THREE_PATTERN,
     THREE_CATCH,
     THREE_LOOSE_PATTERN,
@@ -201,14 +202,13 @@ def examine_text(
     max_date: datetime,
 ) -> Optional[str]:
     "Prepare text and try to extract a date."
-    attempt = None
     text = trim_text(text)
-    if len(text) > MIN_SEGMENT_LEN:
-        text = NON_DIGITS_REGEX.sub("", text[:MAX_SEGMENT_LEN])
-        attempt = try_date_expr(
-            text, outputformat, extensive_search, min_date, max_date
-        )
-    return attempt
+
+    if len(text) <= MIN_SEGMENT_LEN:
+        return None
+
+    text = NON_DIGITS_REGEX.sub("", text[:MAX_SEGMENT_LEN])
+    return try_date_expr(text, outputformat, extensive_search, min_date, max_date)
 
 
 def examine_date_elements(
@@ -225,17 +225,13 @@ def examine_date_elements(
         return None
 
     for elem in elements:
-        attempt = examine_text(
-            elem.text_content(), outputformat, extensive_search, min_date, max_date
-        )
-        if attempt:
-            return attempt
-        # try link title (Blogspot)
-        attempt = examine_text(
-            elem.get("title", ""), outputformat, extensive_search, min_date, max_date
-        )
-        if attempt:
-            return attempt
+        # try element text and link title (Blogspot)
+        for text in [elem.text_content(), elem.get("title", "")]:
+            attempt = examine_text(
+                text, outputformat, extensive_search, min_date, max_date
+            )
+            if attempt:
+                return attempt
 
     return None
 
@@ -1091,7 +1087,9 @@ def find_date(
         htmlstring = tostring(search_tree, pretty_print=False).decode("utf-8", "ignore")
 
     # date regex timestamp rescue
-    timestamp_result = timestamp_search(htmlstring, outputformat, min_date, max_date)
+    timestamp_result = pattern_search(
+        htmlstring, TIMESTAMP_PATTERN, outputformat, min_date, max_date
+    )
     if timestamp_result is not None:
         return timestamp_result
 
diff --git a/htmldate/extractors.py b/htmldate/extractors.py
index cfb1ebcc..d3cd346b 100644
--- a/htmldate/extractors.py
+++ b/htmldate/extractors.py
@@ -12,7 +12,7 @@
 
 from datetime import datetime
 from functools import lru_cache
-from typing import List, Optional, Tuple
+from typing import List, Optional, Pattern, Tuple
 
 # coverage for date parsing
 from dateparser import DateDataParser  # type: ignore  # third-party, slow
@@ -463,6 +463,23 @@ def img_search(
     return None
 
 
+def pattern_search(
+    text: str,
+    date_pattern: Pattern[str],
+    outputformat: str,
+    min_date: datetime,
+    max_date: datetime,
+) -> Optional[str]:
+    "Look for date expressions using a regular expression on a string of text."
+    match = date_pattern.search(text)
+    if match and is_valid_date(
+        match[1], "%Y-%m-%d", earliest=min_date, latest=max_date
+    ):
+        LOGGER.debug("regex found: %s %s", date_pattern, match[0])
+        return convert_date(match[1], "%Y-%m-%d", outputformat)
+    return None
+
+
 def json_search(
     tree: HtmlElement,
     outputformat: str,
@@ -479,25 +496,12 @@ def json_search(
     ):
         if not elem.text or '"date' not in elem.text:
             continue
-        json_match = json_pattern.search(elem.text)
-        if json_match and is_valid_date(
-            json_match[1], "%Y-%m-%d", earliest=min_date, latest=max_date
-        ):
-            LOGGER.debug("JSON time found: %s", json_match[0])
-            return convert_date(json_match[1], "%Y-%m-%d", outputformat)
-    return None
-
-
-def timestamp_search(
-    htmlstring: str, outputformat: str, min_date: datetime, max_date: datetime
-) -> Optional[str]:
-    """Look for timestamps throughout the web page"""
-    tstamp_match = TIMESTAMP_PATTERN.search(htmlstring)
-    if tstamp_match and is_valid_date(
-        tstamp_match[1], "%Y-%m-%d", earliest=min_date, latest=max_date
-    ):
-        LOGGER.debug("time regex found: %s", tstamp_match[0])
-        return convert_date(tstamp_match[1], "%Y-%m-%d", outputformat)
+        json_result = pattern_search(
+            elem.text, json_pattern, outputformat, min_date, max_date
+        )
+        # keep scanning the remaining scripts when this one yields no valid date
+        if json_result is not None:
+            return json_result
     return None
 
 