Skip to content

Commit

Permalink
extensive search: regex_parse on whole document (#67)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Nov 15, 2022
1 parent dffa53b commit 522db7d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 17 deletions.
31 changes: 16 additions & 15 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
idiosyncrasies_search,
img_search,
json_search,
# regex_parse,
regex_parse,
timestamp_search,
try_date_expr,
DATE_EXPRESSIONS,
Expand Down Expand Up @@ -846,27 +846,28 @@ def search_page(
if result is not None:
return result

# try full-blown text regex on all HTML?
dateobject = regex_parse(htmlstring)
# todo: find all candidates and disambiguate?
if date_validator(
dateobject, outputformat, earliest=min_date, latest=max_date
) is True and (
copyear == 0 or dateobject.year >= copyear # type: ignore[union-attr]
):
try:
LOGGER.debug("regex result on HTML: %s", dateobject)
return dateobject.strftime(outputformat) # type: ignore
except ValueError as err:
LOGGER.error("value error during conversion: %s %s", dateobject, err)

# catchall: copyright mention
if copyear != 0:
LOGGER.debug("using copyright year as default")
return convert_date(
"-".join([str(copyear), "01", "01"]), "%Y-%m-%d", outputformat
)

# try full-blown text regex on all HTML?
# dateobject = regex_parse(htmlstring)
# todo: find all candidates and disambiguate?
# if (
# date_validator(dateobject, outputformat, earliest=min_date, latest=max_date)
# is True
# ):
# try:
# LOGGER.debug("regex result on HTML: %s", dateobject)
# return dateobject.strftime(outputformat) # type: ignore
# except ValueError as err:
# LOGGER.error("value error during conversion: %s %s", string, err)

# 1 component, last try
# last resort: 1 component
LOGGER.debug("switching to one component")
bestmatch = search_pattern(
htmlstring,
Expand Down
3 changes: 1 addition & 2 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,8 +559,7 @@ def test_exact_date():
'<html><body><time class="Feed-module--feed__item-meta-time--3t1fg" dateTime="November 29, 2020">November 2020</time></body></html>',
outputformat="%Y-%m-%d %H:%m:%S",
)
== "2020-01-01 00:01:00" # solution so far
# == "2020-11-29 00:11:00" # better
== "2020-11-29 00:11:00" # "00:11" is the month: outputformat uses %m (month) in the minutes slot, probably meant %M
)
assert (
find_date(
Expand Down

0 comments on commit 522db7d

Please sign in to comment.