diff --git a/htmldate/core.py b/htmldate/core.py
index 4ecb3951..c1a4f8cb 100644
--- a/htmldate/core.py
+++ b/htmldate/core.py
@@ -205,8 +205,7 @@ def examine_date_elements(
elements = tree.xpath(expression)
if not elements or len(elements) > MAX_POSSIBLE_CANDIDATES:
return None
- # loop through the elements to analyze
- attempt = None
+
for elem in elements:
# trim
text = elem.text_content().strip()
@@ -222,19 +221,19 @@ def examine_date_elements(
attempt = try_date_expr(
text, outputformat, extensive_search, min_date, max_date
)
- if attempt is not None:
- break
+ if attempt:
+ return attempt
# try link title (Blogspot)
title_attr = elem.get("title", "").strip()
- if title_attr is not None and len(title_attr) > 0:
+ if len(title_attr) > 0:
title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_TEXT_SIZE])
attempt = try_date_expr(
title_attr, outputformat, extensive_search, min_date, max_date
)
- if attempt is not None:
- break
- # catchall
- return attempt
+ if attempt:
+ return attempt
+
+ return None
def examine_header(
@@ -282,8 +281,10 @@ def examine_header(
# loop through all meta elements
for elem in tree.iterfind(".//meta"):
# safeguard
- if not elem.attrib or (
- not "content" in elem.attrib and not "datetime" in elem.attrib
+ if (
+ not elem.attrib
+ or "content" not in elem.attrib
+ and "datetime" not in elem.attrib
):
continue
# name attribute, most frequent
@@ -383,54 +384,50 @@ def select_candidate(
max_date: datetime,
) -> Optional[Match[str]]:
"""Select a candidate among the most frequent matches"""
- match, year1, year2 = None, None, None
- # LOGGER.debug('occurrences: %s', occurrences)
- if len(occurrences) == 0 or len(occurrences) > MAX_POSSIBLE_CANDIDATES:
+ if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES:
return None
+
if len(occurrences) == 1:
- match = catch.search(list(occurrences.keys())[0])
+ match = catch.search(next(iter(occurrences)))
if match:
return match
- # select among most frequent
+
+ # select among most frequent: more than 10? more than 2 candidates?
firstselect = occurrences.most_common(10)
LOGGER.debug("firstselect: %s", firstselect)
# sort and find probable candidates
- if original_date:
- bestones = sorted(firstselect)[:2]
- else:
- bestones = sorted(firstselect, reverse=True)[:2]
-
- first_pattern, first_count = bestones[0][0], bestones[0][1]
- second_pattern, second_count = bestones[1][0], bestones[1][1]
+ bestones = sorted(firstselect, reverse=not original_date)[:2]
LOGGER.debug("bestones: %s", bestones)
+
# plausibility heuristics
- validation1, validation2 = False, False
- match1 = yearpat.search(first_pattern)
- if match1 is not None:
- year1 = match1[1]
- validation1 = date_validator(year1, "%Y", earliest=min_date, latest=max_date)
- match2 = yearpat.search(second_pattern)
- if match2 is not None:
- year2 = match2[1]
- validation2 = date_validator(year2, "%Y", earliest=min_date, latest=max_date)
+ patterns, counts = zip(*bestones)
+ years = [""] * len(bestones)
+ validation = [False] * len(bestones)
+ for i, pattern in enumerate(patterns):
+ year_match = yearpat.search(pattern)
+ if year_match:
+ years[i] = year_match[1]
+ dateobject = datetime(int(year_match[1]), 1, 1)
+ validation[i] = date_validator(
+ dateobject, "%Y", earliest=min_date, latest=max_date
+ )
+
# safety net: plausibility
- if validation1 is True and validation2 is True:
+ match = None
+ if all(validation):
# same number of occurrences: always take top of the pile?
- if first_count == second_count:
- match = catch.search(first_pattern)
+ if counts[0] == counts[1]:
+ match = catch.search(patterns[0])
# safety net: newer date but up to 50% less frequent
- elif year2 != year1 and second_count / first_count > 0.5:
- match = catch.search(second_pattern)
+ elif years[1] != years[0] and counts[1] / counts[0] > 0.5:
+ match = catch.search(patterns[1])
# not newer or hopefully not significant
else:
- match = catch.search(first_pattern)
- elif validation1 is False and validation2 is True:
- match = catch.search(second_pattern)
- elif validation1 is True and validation2 is False:
- match = catch.search(first_pattern)
+ match = catch.search(patterns[0])
+ elif any(validation):
+ match = catch.search(patterns[validation.index(True)])
else:
- LOGGER.debug("no suitable candidate: %s %s", year1, year2)
- return None
+ LOGGER.debug("no suitable candidate: %s %s", years[0], years[1])
return match
@@ -697,12 +694,13 @@ def search_page(
)
if bestmatch is not None:
LOGGER.debug("Copyright detected: %s", bestmatch[0])
+ dateobject = datetime(int(bestmatch[0]), 1, 1)
if (
date_validator(bestmatch[0], "%Y", earliest=min_date, latest=max_date)
is True
):
LOGGER.debug("copyright year/footer pattern found: %s", bestmatch[0])
- copyear = int(bestmatch[0])
+ copyear = dateobject.year
# 3 components
LOGGER.debug("3 components")
@@ -853,12 +851,17 @@ def search_page(
max_date,
)
if bestmatch is not None:
- pagedate = "-".join([bestmatch[1], bestmatch[2], "01"])
+ dateobject = datetime(int(bestmatch[1]), int(bestmatch[2]), 1)
if date_validator(
- pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date
- ) is True and (copyear == 0 or int(bestmatch[1]) >= copyear):
- LOGGER.debug('date found for pattern "%s": %s', YYYYMM_PATTERN, pagedate)
- return convert_date(pagedate, "%Y-%m-%d", outputformat)
+ dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date
+ ) is True and (copyear == 0 or dateobject.year >= copyear):
+ LOGGER.debug(
+ 'date found for pattern "%s": %s, %s',
+ YYYYMM_PATTERN,
+ bestmatch[1],
+ bestmatch[2],
+ )
+ return dateobject.strftime(outputformat)
# 2 components, second option
candidates = plausible_year_filter(
@@ -896,16 +899,14 @@ def search_page(
return result
# try full-blown text regex on all HTML?
- dateobject = regex_parse(htmlstring)
+ dateobject = regex_parse(htmlstring) # type: ignore[assignment]
# todo: find all candidates and disambiguate?
if date_validator(
dateobject, outputformat, earliest=min_date, latest=max_date
- ) is True and (
- copyear == 0 or dateobject.year >= copyear # type: ignore[union-attr]
- ):
+ ) is True and (copyear == 0 or dateobject.year >= copyear):
try:
LOGGER.debug("regex result on HTML: %s", dateobject)
- return dateobject.strftime(outputformat) # type: ignore
+ return dateobject.strftime(outputformat)
except ValueError as err:
LOGGER.error("value error during conversion: %s %s", dateobject, err)
@@ -928,14 +929,16 @@ def search_page(
max_date,
)
if bestmatch is not None:
- pagedate = "-".join([bestmatch[0], "01", "01"])
+ dateobject = datetime(int(bestmatch[0]), 1, 1)
if (
- date_validator(pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date)
+ date_validator(dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date)
is True
- and int(bestmatch[0]) >= copyear
+ and int(dateobject.year) >= copyear
):
- LOGGER.debug('date found for pattern "%s": %s', SIMPLE_PATTERN, pagedate)
- return convert_date(pagedate, "%Y-%m-%d", outputformat)
+ LOGGER.debug(
+ 'date found for pattern "%s": %s', SIMPLE_PATTERN, bestmatch[0]
+ )
+ return dateobject.strftime(outputformat)
return None
diff --git a/htmldate/extractors.py b/htmldate/extractors.py
index d64bb306..984386dd 100644
--- a/htmldate/extractors.py
+++ b/htmldate/extractors.py
@@ -53,32 +53,36 @@
SLOW_PREPEND = ".//*"
DATE_EXPRESSIONS = """
- [contains(translate(@id, "D", "d"), 'date')
- or contains(translate(@class, "D", "d"), 'date')
- or contains(translate(@itemprop, "D", "d"), 'date')
- or contains(translate(@id, "D", "d"), 'datum')
- or contains(translate(@class, "D", "d"), 'datum')
- or contains(@id, 'time') or contains(@class, 'time')
- or @class='meta' or contains(translate(@id, "M", "m"), 'metadata')
- or contains(translate(@class, "M", "m"), 'meta-')
- or contains(translate(@class, "M", "m"), '-meta')
- or contains(translate(@id, "M", "m"), '-meta')
- or contains(translate(@class, "M", "m"), '_meta')
- or contains(translate(@class, "M", "m"), 'postmeta')
- or contains(@class, 'info') or contains(@class, 'post_detail')
- or contains(@class, 'block-content')
- or contains(@class, 'byline') or contains(@class, 'subline')
- or contains(@class, 'posted') or contains(@class, 'submitted')
- or contains(@class, 'created-post')
- or contains(@id, 'publish') or contains(@class, 'publish')
- or contains(@class, 'publication')
- or contains(@class, 'author') or contains(@class, 'autor')
- or contains(@class, 'field-content')
- or contains(@class, 'fa-clock-o') or contains(@class, 'fa-calendar')
- or contains(@class, 'fecha') or contains(@class, 'parution')
- or contains(@class, 'footer') or contains(@id, 'footer')]
- |
- .//footer|.//small
+[
+ (@id|@class|@itemprop)[contains(translate(., "D", "d"), 'date')] or
+ (@id|@class|@itemprop)[contains(translate(., "D", "d"), 'datum')] or
+ (@id|@class)[contains(., 'time')] or
+ @class='meta' or
+ (@id|@class)[contains(translate(., "M", "m"), 'metadata')] or
+ (@id|@class)[contains(translate(., "M", "m"), 'meta-')] or
+ (@id|@class)[contains(translate(., "M", "m"), '-meta')] or
+ (@id|@class)[contains(translate(., "M", "m"), '_meta')] or
+ (@id|@class)[contains(translate(., "M", "m"), 'postmeta')] or
+ (@id|@class)[contains(., 'publish')] or
+ (@id|@class)[contains(., 'footer')] or
+ contains(@class, 'info') or
+ contains(@class, 'post_detail') or
+ contains(@class, 'block-content') or
+ contains(@class, 'byline') or
+ contains(@class, 'subline') or
+ contains(@class, 'posted') or
+ contains(@class, 'submitted') or
+ contains(@class, 'created-post') or
+ contains(@class, 'publication') or
+ contains(@class, 'author') or
+ contains(@class, 'autor') or
+ contains(@class, 'field-content') or
+ contains(@class, 'fa-clock-o') or
+ contains(@class, 'fa-calendar') or
+ contains(@class, 'fecha') or
+ contains(@class, 'parution')
+] |
+.//footer | .//small
"""
# further tests needed:
# or contains(@class, 'article')
@@ -97,12 +101,12 @@
# regex cache
YMD_NO_SEP_PATTERN = re.compile(r"\b(\d{8})\b")
YMD_PATTERN = re.compile(
- r"(?:\D|^)(?P<year>\d{4})[\-/.](?P<month>\d{1,2})[\-/.](?P<day>\d{1,2})(?:\D|$)|"
- r"(?:\D|^)(?P<day2>\d{1,2})[\-/.](?P<month2>\d{1,2})[\-/.](?P<year2>\d{2,4})(?:\D|$)"
+ r"(?:\D|^)(?:(?P<year>\d{4})[\-/.](?P<month>\d{1,2})[\-/.](?P<day>\d{1,2})|"  # YMD
+ r"(?P<day2>\d{1,2})[\-/.](?P<month2>\d{1,2})[\-/.](?P<year2>\d{2,4}))(?:\D|$)"  # DMY
)
YM_PATTERN = re.compile(
- r"(?:\D|^)(?P<year>\d{4})[\-/.](?P<month>\d{1,2})(?:\D|$)|"
- r"(?:\D|^)(?P<month2>\d{1,2})[\-/.](?P<year2>\d{4})(?:\D|$)"
+ r"(?:\D|^)(?:(?P<year>\d{4})[\-/.](?P<month>\d{1,2})|"  # YM
+ r"(?P<month2>\d{1,2})[\-/.](?P<year2>\d{4}))(?:\D|$)"  # MY
)
REGEX_MONTHS = """
@@ -116,7 +120,8 @@
""" # todo: check "août"
LONG_TEXT_PATTERN = re.compile(
rf"""(?P{REGEX_MONTHS})\s
-(?P[0-9]{{1,2}})(?:st|nd|rd|th)?,? (?P[0-9]{{4}})|(?P[0-9]{{1,2}})(?:st|nd|rd|th|\.)? (?:of )?
+(?P[0-9]{{1,2}})(?:st|nd|rd|th)?,? (?P[0-9]{{4}})|
+(?P[0-9]{{1,2}})(?:st|nd|rd|th|\.)? (?:of )?
(?P{REGEX_MONTHS})[,.]? (?P[0-9]{{4}})""".replace(
"\n", ""
),
@@ -133,95 +138,23 @@
)
# English, French, German, Indonesian and Turkish dates cache
+MONTHS = [  # one tuple of accepted spellings per month, January first
+ ("jan", "januar", "jänner", "january", "januari", "janvier", "ocak", "oca"),
+ ("feb", "februar", "feber", "february", "februari", "février", "şubat", "şub"),
+ ("mar", "mär", "märz", "march", "maret", "mart", "mars"),
+ ("apr", "april", "avril", "nisan", "nis"),
+ ("may", "mai", "mei", "mayıs"),
+ ("jun", "juni", "june", "juin", "haziran", "haz"),
+ ("jul", "juli", "july", "juillet", "temmuz", "tem"),
+ ("aug", "august", "agustus", "ağustos", "ağu", "aout"),
+ ("sep", "september", "septembre", "eylül", "eyl"),
+ ("oct", "oktober", "october", "octobre", "okt", "ekim", "eki"),
+ ("nov", "november", "kasım", "kas", "novembre"),
+ ("dec", "dez", "dezember", "december", "desember", "décembre", "aralık", "ara"),
+]
+
TEXT_MONTHS = {
- # January
- "januar": "01",
- "jänner": "01",
- "january": "01",
- "januari": "01",
- "janvier": "01",
- "jan": "01",
- "ocak": "01",
- "oca": "01",
- # February
- "februar": "02",
- "feber": "02",
- "february": "02",
- "februari": "02",
- "février": "02",
- "feb": "02",
- "şubat": "02",
- "şub": "02",
- # March
- "märz": "03",
- "march": "03",
- "maret": "03",
- "mar": "03",
- "mär": "03",
- "mart": "03",
- "mars": "03",
- # April
- "april": "04",
- "apr": "04",
- "avril": "04",
- "nisan": "04",
- "nis": "04",
- # May
- "mai": "05",
- "may": "05",
- "mei": "05",
- "mayıs": "05",
- # June
- "juni": "06",
- "june": "06",
- "juin": "06",
- "jun": "06",
- "haziran": "06",
- "haz": "06",
- # July
- "juli": "07",
- "july": "07",
- "juillet": "07",
- "jul": "07",
- "temmuz": "07",
- "tem": "07",
- # August
- "august": "08",
- "agustus": "08",
- "aug": "08",
- "ağustos": "08",
- "ağu": "08",
- "aout": "08",
- # "août": "08",
- # September
- "september": "09",
- "septembre": "09",
- "sep": "09",
- "eylül": "09",
- "eyl": "09",
- # October
- "oktober": "10",
- "october": "10",
- "octobre": "10",
- "oct": "10",
- "okt": "10",
- "ekim": "10",
- "eki": "10",
- # November
- "november": "11",
- "nov": "11",
- "kasım": "11",
- "kas": "11",
- "novembre": "11",
- # December
- "dezember": "12",
- "december": "12",
- "desember": "12",
- "décembre": "12",
- "dec": "12",
- "dez": "12",
- "aralık": "12",
- "ara": "12",
+ month: mnum for mnum, mlist in enumerate(MONTHS, start=1) for month in mlist
}
TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")
@@ -231,7 +164,12 @@
# leads to errors: \D+\d{3,}\D+
DISCARD_PATTERNS = re.compile(
- r"[$€¥Ұ£¢₽₱฿#]|CNY|EUR|GBP|JPY|USD|http|\.(com|net|org)|IBAN|\+\d{2}\b"
+ r"[$€¥Ұ£¢₽₱฿#]|" # currency symbols
+ r"CNY|EUR|GBP|JPY|USD|" # currency codes
+ r"http|" # protocols
+ r"\.(com|net|org)|" # TLDs
+ r"IBAN|" # bank accounts
+ r"\+\d{2}\b" # amounts/telephone numbers
)
# further testing required:
# \d[,.]\d+ # currency amounts
@@ -353,18 +291,16 @@ def regex_parse(string: str) -> Optional[datetime]:
return None
# process and return
try:
- if match.lastgroup == "year":
- day, month, year = (
- int(match.group("day")),
- int(TEXT_MONTHS[match.group("month").lower().strip(".")]),
- int(match.group("year")),
- )
- else:
- day, month, year = (
- int(match.group("day2")),
- int(TEXT_MONTHS[match.group("month2").lower().strip(".")]),
- int(match.group("year2")),
- )
+ groups = (
+ ("day", "month", "year")
+ if match.lastgroup == "year"
+ else ("day2", "month2", "year2")
+ )
+ day, month, year = (
+ int(match.group(groups[0])),
+ int(TEXT_MONTHS[match.group(groups[1]).lower().strip(".")]),
+ int(match.group(groups[2])),
+ )
year = correct_year(year)
day, month = try_swap_values(day, month)
dateobject = datetime(year, month, day)
@@ -431,14 +367,12 @@ def custom_parse(
match = YMD_PATTERN.search(string)
if match:
try:
- # YMD
if match.lastgroup == "day":
- candidate = datetime(
+ year, month, day = (
int(match.group("year")),
int(match.group("month")),
int(match.group("day")),
)
- # DMY
else:
day, month, year = (
int(match.group("day2")),
@@ -447,15 +381,13 @@ def custom_parse(
)
year = correct_year(year)
day, month = try_swap_values(day, month)
- candidate = datetime(year, month, day)
+
+ candidate = datetime(year, month, day)
except ValueError:
LOGGER.debug("regex value error: %s", match[0])
else:
- if (
- date_validator(
- candidate, "%Y-%m-%d", earliest=min_date, latest=max_date
- )
- is True
+ if date_validator(
+ candidate, "%Y-%m-%d", earliest=min_date, latest=max_date
):
LOGGER.debug("regex match: %s", candidate)
return candidate.strftime(outputformat)
@@ -608,33 +540,23 @@ def idiosyncrasies_search(
htmlstring: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
"""Look for author-written dates throughout the web page"""
- # EN+DE+TR
- match = TEXT_PATTERNS.search(htmlstring)
- if not match:
- return None
-
- parts = list(filter(None, match.groups()))
- if len(parts) != 3:
- return None
-
- candidate = None
-
- if len(parts[0]) == 4:
- candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2]))
- elif len(parts[2]) in (2, 4):
- # DD/MM/YY
- day, month = try_swap_values(int(parts[0]), int(parts[1]))
- year = correct_year(int(parts[2]))
- try:
- candidate = datetime(year, month, day)
- except ValueError:
- LOGGER.debug("value error in idiosyncrasies: %s", match[0])
-
- if (
- date_validator(candidate, "%Y-%m-%d", earliest=min_date, latest=max_date)
- is True
- ):
- LOGGER.debug("idiosyncratic pattern found: %s", match[0])
- return candidate.strftime(outputformat) # type: ignore[union-attr]
-
+ match = TEXT_PATTERNS.search(htmlstring) # EN+DE+TR
+ if match:
+ parts = list(filter(None, match.groups()))
+ if len(parts) == 3:
+ candidate = None
+ if len(parts[0]) == 4:
+ candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2]))
+ elif len(parts[2]) in (2, 4):
+ # DD/MM/YY
+ day, month = try_swap_values(int(parts[0]), int(parts[1]))
+ year = correct_year(int(parts[2]))
+ try:
+ candidate = datetime(year, month, day)
+ except ValueError:
+ LOGGER.debug("value error in idiosyncrasies: %s", match[0])
+ if date_validator(
+ candidate, "%Y-%m-%d", earliest=min_date, latest=max_date
+ ):
+ return candidate.strftime(outputformat) # type: ignore[union-attr]
return None
diff --git a/htmldate/validators.py b/htmldate/validators.py
index 52144255..aa4ff029 100644
--- a/htmldate/validators.py
+++ b/htmldate/validators.py
@@ -48,12 +48,9 @@ def date_validator(
return False
else:
dateobject = date_input
- # basic year validation
- year = int(datetime.strftime(dateobject, "%Y"))
- min_year, max_year = earliest.year, latest.year
- # full validation: not newer than today or stored variable
+ # year first, then full validation: not newer than today or stored variable
if (
- min_year <= year <= max_year
+ earliest.year <= dateobject.year <= latest.year
and earliest.timestamp() <= dateobject.timestamp() <= latest.timestamp()
):
return True
@@ -89,34 +86,26 @@ def plausible_year_filter(
incomplete: bool = False,
) -> Counter_Type[str]:
"""Filter the date patterns to find plausible years only"""
- # slow!
- occurrences = Counter(pattern.findall(htmlstring))
- toremove = set()
- # LOGGER.debug('occurrences: %s', occurrences)
- # look for implausible dates
- for item in occurrences.keys():
- # scrap implausible dates
+ occurrences = Counter(pattern.findall(htmlstring)) # slow!
+
+ for item in list(occurrences):
year_match = yearpat.search(item)
- if year_match is not None:
- if not incomplete:
- potential_year = int(year_match[1])
- else:
- lastdigits = year_match[1]
- if lastdigits[0] == "9":
- potential_year = int("19" + lastdigits)
- else:
- potential_year = int("20" + lastdigits)
- if not earliest.year <= potential_year <= latest.year:
- LOGGER.debug("no potential year: %s", item)
- toremove.add(item)
- # occurrences.remove(item)
- # continue
- else:
+ if year_match is None:
LOGGER.debug("not a year pattern: %s", item)
- toremove.add(item)
- # remove candidates
- for item in toremove:
- del occurrences[item]
+ del occurrences[item]
+ continue
+
+ lastdigits = year_match[1]
+ if not incomplete:
+ potential_year = int(lastdigits)
+ else:
+ century = "19" if lastdigits[0] == "9" else "20"
+ potential_year = int(century + lastdigits)
+
+ if not earliest.year <= potential_year <= latest.year:
+ LOGGER.debug("no potential year: %s", item)
+ del occurrences[item]
+
return occurrences
@@ -202,7 +191,7 @@ def check_date_input(
return date_object
if isinstance(date_object, str):
try:
- return datetime.fromisoformat(date_object) # type: ignore
+ return datetime.fromisoformat(date_object) # type: ignore[attr-defined]
except ValueError:
LOGGER.warning("invalid datetime string: %s", date_object)
return default # no input or error thrown