Skip to content

Commit

Permalink
simplify further
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Oct 26, 2023
1 parent 69a1e4f commit beef551
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 40 deletions.
38 changes: 18 additions & 20 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
img_search,
json_search,
regex_parse,
timestamp_search,
pattern_search,
try_date_expr,
DATE_EXPRESSIONS,
FAST_PREPEND,
Expand All @@ -37,6 +37,7 @@
YEAR_PATTERN,
YMD_PATTERN,
COPYRIGHT_PATTERN,
TIMESTAMP_PATTERN,
THREE_PATTERN,
THREE_CATCH,
THREE_LOOSE_PATTERN,
Expand Down Expand Up @@ -201,14 +202,13 @@ def examine_text(
max_date: datetime,
) -> Optional[str]:
"Prepare text and try to extract a date."
attempt = None
text = trim_text(text)
if len(text) > MIN_SEGMENT_LEN:
text = NON_DIGITS_REGEX.sub("", text[:MAX_SEGMENT_LEN])
attempt = try_date_expr(
text, outputformat, extensive_search, min_date, max_date
)
return attempt

if len(text) <= MIN_SEGMENT_LEN:
return None

text = NON_DIGITS_REGEX.sub("", text[:MAX_SEGMENT_LEN])
return try_date_expr(text, outputformat, extensive_search, min_date, max_date)


def examine_date_elements(
Expand All @@ -225,17 +225,13 @@ def examine_date_elements(
return None

for elem in elements:
attempt = examine_text(
elem.text_content(), outputformat, extensive_search, min_date, max_date
)
if attempt:
return attempt
# try link title (Blogspot)
attempt = examine_text(
elem.get("title", ""), outputformat, extensive_search, min_date, max_date
)
if attempt:
return attempt
# try element text and link title (Blogspot)
for text in [elem.text_content(), elem.get("title", "")]:
attempt = examine_text(
text, outputformat, extensive_search, min_date, max_date
)
if attempt:
return attempt

return None

Expand Down Expand Up @@ -1091,7 +1087,9 @@ def find_date(
htmlstring = tostring(search_tree, pretty_print=False).decode("utf-8", "ignore")

# date regex timestamp rescue
timestamp_result = timestamp_search(htmlstring, outputformat, min_date, max_date)
timestamp_result = pattern_search(
htmlstring, TIMESTAMP_PATTERN, outputformat, min_date, max_date
)
if timestamp_result is not None:
return timestamp_result

Expand Down
39 changes: 19 additions & 20 deletions htmldate/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from datetime import datetime
from functools import lru_cache
from typing import List, Optional, Tuple
from typing import List, Optional, Pattern, Tuple

# coverage for date parsing
from dateparser import DateDataParser # type: ignore # third-party, slow
Expand Down Expand Up @@ -463,6 +463,23 @@ def img_search(
return None


def pattern_search(
    text: str,
    date_pattern: Pattern[str],
    outputformat: str,
    min_date: datetime,
    max_date: datetime,
) -> Optional[str]:
    """Run a compiled date regex over a string and return the converted hit.

    Only the first match is considered. Its first capture group is checked
    with ``is_valid_date`` as a ``%Y-%m-%d`` string, using ``min_date`` and
    ``max_date`` as the ``earliest``/``latest`` bounds. If the check passes,
    the group is handed to ``convert_date`` together with ``outputformat``
    and that result is returned; in every other case the result is ``None``.
    """
    found = date_pattern.search(text)
    if not found:
        # the pattern does not occur anywhere in the text
        return None

    candidate = found[1]
    if not is_valid_date(
        candidate, "%Y-%m-%d", earliest=min_date, latest=max_date
    ):
        return None

    LOGGER.debug("regex found: %s %s", date_pattern, found[0])
    return convert_date(candidate, "%Y-%m-%d", outputformat)


def json_search(
tree: HtmlElement,
outputformat: str,
Expand All @@ -479,25 +496,7 @@ def json_search(
):
if not elem.text or '"date' not in elem.text:
continue
json_match = json_pattern.search(elem.text)
if json_match and is_valid_date(
json_match[1], "%Y-%m-%d", earliest=min_date, latest=max_date
):
LOGGER.debug("JSON time found: %s", json_match[0])
return convert_date(json_match[1], "%Y-%m-%d", outputformat)
return None


def timestamp_search(
htmlstring: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
"""Look for timestamps throughout the web page"""
tstamp_match = TIMESTAMP_PATTERN.search(htmlstring)
if tstamp_match and is_valid_date(
tstamp_match[1], "%Y-%m-%d", earliest=min_date, latest=max_date
):
LOGGER.debug("time regex found: %s", tstamp_match[0])
return convert_date(tstamp_match[1], "%Y-%m-%d", outputformat)
return pattern_search(elem.text, json_pattern, outputformat, min_date, max_date)
return None


Expand Down

0 comments on commit beef551

Please sign in to comment.