diff --git a/htmldate/core.py b/htmldate/core.py index 4ecb3951..c1a4f8cb 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -205,8 +205,7 @@ def examine_date_elements( elements = tree.xpath(expression) if not elements or len(elements) > MAX_POSSIBLE_CANDIDATES: return None - # loop through the elements to analyze - attempt = None + for elem in elements: # trim text = elem.text_content().strip() @@ -222,19 +221,19 @@ def examine_date_elements( attempt = try_date_expr( text, outputformat, extensive_search, min_date, max_date ) - if attempt is not None: - break + if attempt: + return attempt # try link title (Blogspot) title_attr = elem.get("title", "").strip() - if title_attr is not None and len(title_attr) > 0: + if len(title_attr) > 0: title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_TEXT_SIZE]) attempt = try_date_expr( title_attr, outputformat, extensive_search, min_date, max_date ) - if attempt is not None: - break - # catchall - return attempt + if attempt: + return attempt + + return None def examine_header( @@ -282,8 +281,10 @@ def examine_header( # loop through all meta elements for elem in tree.iterfind(".//meta"): # safeguard - if not elem.attrib or ( - not "content" in elem.attrib and not "datetime" in elem.attrib + if ( + not elem.attrib + or "content" not in elem.attrib + and "datetime" not in elem.attrib ): continue # name attribute, most frequent @@ -383,54 +384,50 @@ def select_candidate( max_date: datetime, ) -> Optional[Match[str]]: """Select a candidate among the most frequent matches""" - match, year1, year2 = None, None, None - # LOGGER.debug('occurrences: %s', occurrences) - if len(occurrences) == 0 or len(occurrences) > MAX_POSSIBLE_CANDIDATES: + if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES: return None + if len(occurrences) == 1: - match = catch.search(list(occurrences.keys())[0]) + match = catch.search(next(iter(occurrences))) if match: return match - # select among most frequent + + # select among most 
frequent: more than 10? more than 2 candidates? firstselect = occurrences.most_common(10) LOGGER.debug("firstselect: %s", firstselect) # sort and find probable candidates - if original_date: - bestones = sorted(firstselect)[:2] - else: - bestones = sorted(firstselect, reverse=True)[:2] - - first_pattern, first_count = bestones[0][0], bestones[0][1] - second_pattern, second_count = bestones[1][0], bestones[1][1] + bestones = sorted(firstselect, reverse=not original_date)[:2] LOGGER.debug("bestones: %s", bestones) + # plausibility heuristics - validation1, validation2 = False, False - match1 = yearpat.search(first_pattern) - if match1 is not None: - year1 = match1[1] - validation1 = date_validator(year1, "%Y", earliest=min_date, latest=max_date) - match2 = yearpat.search(second_pattern) - if match2 is not None: - year2 = match2[1] - validation2 = date_validator(year2, "%Y", earliest=min_date, latest=max_date) + patterns, counts = zip(*bestones) + years = [""] * len(bestones) + validation = [False] * len(bestones) + for i, pattern in enumerate(patterns): + year_match = yearpat.search(pattern) + if year_match: + years[i] = year_match[1] + dateobject = datetime(int(year_match[1]), 1, 1) + validation[i] = date_validator( + dateobject, "%Y", earliest=min_date, latest=max_date + ) + # safety net: plausibility - if validation1 is True and validation2 is True: + match = None + if all(validation): # same number of occurrences: always take top of the pile? 
- if first_count == second_count: - match = catch.search(first_pattern) + if counts[0] == counts[1]: + match = catch.search(patterns[0]) # safety net: newer date but up to 50% less frequent - elif year2 != year1 and second_count / first_count > 0.5: - match = catch.search(second_pattern) + elif years[1] != years[0] and counts[1] / counts[0] > 0.5: + match = catch.search(patterns[1]) # not newer or hopefully not significant else: - match = catch.search(first_pattern) - elif validation1 is False and validation2 is True: - match = catch.search(second_pattern) - elif validation1 is True and validation2 is False: - match = catch.search(first_pattern) + match = catch.search(patterns[0]) + elif any(validation): + match = catch.search(patterns[validation.index(True)]) else: - LOGGER.debug("no suitable candidate: %s %s", year1, year2) - return None + LOGGER.debug("no suitable candidate: %s %s", years[0], years[1]) return match @@ -697,12 +694,13 @@ def search_page( ) if bestmatch is not None: LOGGER.debug("Copyright detected: %s", bestmatch[0]) + dateobject = datetime(int(bestmatch[0]), 1, 1) if ( date_validator(bestmatch[0], "%Y", earliest=min_date, latest=max_date) is True ): LOGGER.debug("copyright year/footer pattern found: %s", bestmatch[0]) - copyear = int(bestmatch[0]) + copyear = dateobject.year # 3 components LOGGER.debug("3 components") @@ -853,12 +851,17 @@ def search_page( max_date, ) if bestmatch is not None: - pagedate = "-".join([bestmatch[1], bestmatch[2], "01"]) + dateobject = datetime(int(bestmatch[1]), int(bestmatch[2]), 1) if date_validator( - pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date - ) is True and (copyear == 0 or int(bestmatch[1]) >= copyear): - LOGGER.debug('date found for pattern "%s": %s', YYYYMM_PATTERN, pagedate) - return convert_date(pagedate, "%Y-%m-%d", outputformat) + dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date + ) is True and (copyear == 0 or dateobject.year >= copyear): + LOGGER.debug( + 'date found for 
pattern "%s": %s, %s', + YYYYMM_PATTERN, + bestmatch[1], + bestmatch[2], + ) + return dateobject.strftime(outputformat) # 2 components, second option candidates = plausible_year_filter( @@ -896,16 +899,14 @@ def search_page( return result # try full-blown text regex on all HTML? - dateobject = regex_parse(htmlstring) + dateobject = regex_parse(htmlstring) # type: ignore[assignment] # todo: find all candidates and disambiguate? if date_validator( dateobject, outputformat, earliest=min_date, latest=max_date - ) is True and ( - copyear == 0 or dateobject.year >= copyear # type: ignore[union-attr] - ): + ) is True and (copyear == 0 or dateobject.year >= copyear): try: LOGGER.debug("regex result on HTML: %s", dateobject) - return dateobject.strftime(outputformat) # type: ignore + return dateobject.strftime(outputformat) except ValueError as err: LOGGER.error("value error during conversion: %s %s", dateobject, err) @@ -928,14 +929,16 @@ def search_page( max_date, ) if bestmatch is not None: - pagedate = "-".join([bestmatch[0], "01", "01"]) + dateobject = datetime(int(bestmatch[0]), 1, 1) if ( - date_validator(pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date) + date_validator(dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date) is True - and int(bestmatch[0]) >= copyear + and int(dateobject.year) >= copyear ): - LOGGER.debug('date found for pattern "%s": %s', SIMPLE_PATTERN, pagedate) - return convert_date(pagedate, "%Y-%m-%d", outputformat) + LOGGER.debug( + 'date found for pattern "%s": %s', SIMPLE_PATTERN, bestmatch[0] + ) + return dateobject.strftime(outputformat) return None diff --git a/htmldate/extractors.py b/htmldate/extractors.py index d64bb306..984386dd 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -53,32 +53,36 @@ SLOW_PREPEND = ".//*" DATE_EXPRESSIONS = """ - [contains(translate(@id, "D", "d"), 'date') - or contains(translate(@class, "D", "d"), 'date') - or contains(translate(@itemprop, "D", "d"), 'date') - or 
contains(translate(@id, "D", "d"), 'datum') - or contains(translate(@class, "D", "d"), 'datum') - or contains(@id, 'time') or contains(@class, 'time') - or @class='meta' or contains(translate(@id, "M", "m"), 'metadata') - or contains(translate(@class, "M", "m"), 'meta-') - or contains(translate(@class, "M", "m"), '-meta') - or contains(translate(@id, "M", "m"), '-meta') - or contains(translate(@class, "M", "m"), '_meta') - or contains(translate(@class, "M", "m"), 'postmeta') - or contains(@class, 'info') or contains(@class, 'post_detail') - or contains(@class, 'block-content') - or contains(@class, 'byline') or contains(@class, 'subline') - or contains(@class, 'posted') or contains(@class, 'submitted') - or contains(@class, 'created-post') - or contains(@id, 'publish') or contains(@class, 'publish') - or contains(@class, 'publication') - or contains(@class, 'author') or contains(@class, 'autor') - or contains(@class, 'field-content') - or contains(@class, 'fa-clock-o') or contains(@class, 'fa-calendar') - or contains(@class, 'fecha') or contains(@class, 'parution') - or contains(@class, 'footer') or contains(@id, 'footer')] - | - .//footer|.//small +[ + contains(translate(@id|@class|@itemprop, "D", "d"), 'date') or + contains(translate(@id|@class|@itemprop, "D", "d"), 'datum') or + contains(@id|@class, 'time') or + @class='meta' or + contains(translate(@id|@class, "M", "m"), 'metadata') or + contains(translate(@id|@class, "M", "m"), 'meta-') or + contains(translate(@id|@class, "M", "m"), '-meta') or + contains(translate(@id|@class, "M", "m"), '_meta') or + contains(translate(@id|@class, "M", "m"), 'postmeta') or + contains(@id|@class, 'publish') or + contains(@id|@class, 'footer') or + contains(@class, 'info') or + contains(@class, 'post_detail') or + contains(@class, 'block-content') or + contains(@class, 'byline') or + contains(@class, 'subline') or + contains(@class, 'posted') or + contains(@class, 'submitted') or + contains(@class, 'created-post') or + 
contains(@class, 'publication') or + contains(@class, 'author') or + contains(@class, 'autor') or + contains(@class, 'field-content') or + contains(@class, 'fa-clock-o') or + contains(@class, 'fa-calendar') or + contains(@class, 'fecha') or + contains(@class, 'parution') +] | +.//footer | .//small """ # further tests needed: # or contains(@class, 'article') @@ -97,12 +101,12 @@ # regex cache YMD_NO_SEP_PATTERN = re.compile(r"\b(\d{8})\b") YMD_PATTERN = re.compile( - r"(?:\D|^)(?P<year>\d{4})[\-/.](?P<month>\d{1,2})[\-/.](?P<day>\d{1,2})(?:\D|$)|" - r"(?:\D|^)(?P<day2>\d{1,2})[\-/.](?P<month2>\d{1,2})[\-/.](?P<year2>\d{2,4})(?:\D|$)" + r"(?:\D|^)(?:(?P<year>\d{4})[\-/.](?P<month>\d{1,2})[\-/.](?P<day>\d{1,2})|" + r"(?P<day2>\d{1,2})[\-/.](?P<month2>\d{1,2})[\-/.](?P<year2>\d{2,4}))(?:\D|$)" ) YM_PATTERN = re.compile( - r"(?:\D|^)(?P<year>\d{4})[\-/.](?P<month>\d{1,2})(?:\D|$)|" - r"(?:\D|^)(?P<month2>\d{1,2})[\-/.](?P<year2>\d{4})(?:\D|$)" + r"(?:\D|^)(?:(?P<year>\d{4})[\-/.](?P<month>\d{1,2})|" + r"(?P<month2>\d{1,2})[\-/.](?P<year2>\d{4}))(?:\D|$)" ) REGEX_MONTHS = """ @@ -116,7 +120,8 @@ """ # todo: check "août" LONG_TEXT_PATTERN = re.compile( rf"""(?P<month>{REGEX_MONTHS})\s -(?P<day>[0-9]{{1,2}})(?:st|nd|rd|th)?,? (?P<year>[0-9]{{4}})|(?P<day2>[0-9]{{1,2}})(?:st|nd|rd|th|\.)? (?:of )? +(?P<day>[0-9]{{1,2}})(?:st|nd|rd|th)?,? (?P<year>[0-9]{{4}})| +(?P<day2>[0-9]{{1,2}})(?:st|nd|rd|th|\.)? (?:of )? (?P<month2>{REGEX_MONTHS})[,.]? 
(?P<year2>[0-9]{{4}})""".replace( "\n", "" ), @@ -133,95 +138,23 @@ ) # English, French, German, Indonesian and Turkish dates cache +MONTHS = [ + ("jan", "januar", "jänner", "january", "januari", "janvier", "ocak", "oca"), + ("feb", "februar", "feber", "february", "februari", "février", "şubat", "şub"), + ("mar", "märz", "march", "maret", "mart", "mars"), + ("apr", "april", "avril", "nisan", "nis"), + ("may", "mai", "mei", "mayıs"), + ("jun", "juni", "june", "juin", "haziran", "haz"), + ("jul", "juli", "july", "juillet", "temmuz", "tem"), + ("aug", "august", "agustus", "ağustos", "ağu", "aout"), + ("sep", "september", "septembre", "eylül", "eyl"), + ("oct", "oktober", "october", "octobre", "okt", "ekim", "eki"), + ("nov", "november", "kasım", "kas", "novembre"), + ("dec", "dezember", "december", "desember", "décembre", "aralık", "ara"), +] + TEXT_MONTHS = { - # January - "januar": "01", - "jänner": "01", - "january": "01", - "januari": "01", - "janvier": "01", - "jan": "01", - "ocak": "01", - "oca": "01", - # February - "februar": "02", - "feber": "02", - "february": "02", - "februari": "02", - "février": "02", - "feb": "02", - "şubat": "02", - "şub": "02", - # March - "märz": "03", - "march": "03", - "maret": "03", - "mar": "03", - "mär": "03", - "mart": "03", - "mars": "03", - # April - "april": "04", - "apr": "04", - "avril": "04", - "nisan": "04", - "nis": "04", - # May - "mai": "05", - "may": "05", - "mei": "05", - "mayıs": "05", - # June - "juni": "06", - "june": "06", - "juin": "06", - "jun": "06", - "haziran": "06", - "haz": "06", - # July - "juli": "07", - "july": "07", - "juillet": "07", - "jul": "07", - "temmuz": "07", - "tem": "07", - # August - "august": "08", - "agustus": "08", - "aug": "08", - "ağustos": "08", - "ağu": "08", - "aout": "08", - # "août": "08", - # September - "september": "09", - "septembre": "09", - "sep": "09", - "eylül": "09", - "eyl": "09", - # October - "oktober": "10", - "october": "10", - "octobre": "10", - "oct": "10", - "okt": "10", 
- "ekim": "10", - "eki": "10", - # November - "november": "11", - "nov": "11", - "kasım": "11", - "kas": "11", - "novembre": "11", - # December - "dezember": "12", - "december": "12", - "desember": "12", - "décembre": "12", - "dec": "12", - "dez": "12", - "aralık": "12", - "ara": "12", + month: mnum for mnum, mlist in enumerate(MONTHS, start=1) for month in mlist } TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$") @@ -231,7 +164,12 @@ # leads to errors: \D+\d{3,}\D+ DISCARD_PATTERNS = re.compile( - r"[$€¥Ұ£¢₽₱฿#]|CNY|EUR|GBP|JPY|USD|http|\.(com|net|org)|IBAN|\+\d{2}\b" + r"[$€¥Ұ£¢₽₱฿#]|" # currency symbols + r"CNY|EUR|GBP|JPY|USD|" # currency codes + r"http|" # protocols + r"\.(com|net|org)|" # TLDs + r"IBAN|" # bank accounts + r"\+\d{2}\b" # amounts/telephone numbers ) # further testing required: # \d[,.]\d+ # currency amounts @@ -353,18 +291,16 @@ def regex_parse(string: str) -> Optional[datetime]: return None # process and return try: - if match.lastgroup == "year": - day, month, year = ( - int(match.group("day")), - int(TEXT_MONTHS[match.group("month").lower().strip(".")]), - int(match.group("year")), - ) - else: - day, month, year = ( - int(match.group("day2")), - int(TEXT_MONTHS[match.group("month2").lower().strip(".")]), - int(match.group("year2")), - ) + groups = ( + ("day", "month", "year") + if match.lastgroup == "year" + else ("day2", "month2", "year2") + ) + day, month, year = ( + int(match.group(groups[0])), + int(TEXT_MONTHS[match.group(groups[1]).lower().strip(".")]), + int(match.group(groups[2])), + ) year = correct_year(year) day, month = try_swap_values(day, month) dateobject = datetime(year, month, day) @@ -431,14 +367,12 @@ def custom_parse( match = YMD_PATTERN.search(string) if match: try: - # YMD if match.lastgroup == "day": - candidate = datetime( + year, month, day = ( int(match.group("year")), int(match.group("month")), int(match.group("day")), ) - # DMY else: day, month, year = ( int(match.group("day2")), @@ -447,15 +381,13 @@ def 
custom_parse( ) year = correct_year(year) day, month = try_swap_values(day, month) - candidate = datetime(year, month, day) + + candidate = datetime(year, month, day) except ValueError: LOGGER.debug("regex value error: %s", match[0]) else: - if ( - date_validator( - candidate, "%Y-%m-%d", earliest=min_date, latest=max_date - ) - is True + if date_validator( + candidate, "%Y-%m-%d", earliest=min_date, latest=max_date ): LOGGER.debug("regex match: %s", candidate) return candidate.strftime(outputformat) @@ -608,33 +540,23 @@ def idiosyncrasies_search( htmlstring: str, outputformat: str, min_date: datetime, max_date: datetime ) -> Optional[str]: """Look for author-written dates throughout the web page""" - # EN+DE+TR - match = TEXT_PATTERNS.search(htmlstring) - if not match: - return None - - parts = list(filter(None, match.groups())) - if len(parts) != 3: - return None - - candidate = None - - if len(parts[0]) == 4: - candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2])) - elif len(parts[2]) in (2, 4): - # DD/MM/YY - day, month = try_swap_values(int(parts[0]), int(parts[1])) - year = correct_year(int(parts[2])) - try: - candidate = datetime(year, month, day) - except ValueError: - LOGGER.debug("value error in idiosyncrasies: %s", match[0]) - - if ( - date_validator(candidate, "%Y-%m-%d", earliest=min_date, latest=max_date) - is True - ): - LOGGER.debug("idiosyncratic pattern found: %s", match[0]) - return candidate.strftime(outputformat) # type: ignore[union-attr] - + match = TEXT_PATTERNS.search(htmlstring) # EN+DE+TR + if match: + parts = list(filter(None, match.groups())) + if len(parts) == 3: + candidate = None + if len(parts[0]) == 4: + candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2])) + elif len(parts[2]) in (2, 4): + # DD/MM/YY + day, month = try_swap_values(int(parts[0]), int(parts[1])) + year = correct_year(int(parts[2])) + try: + candidate = datetime(year, month, day) + except ValueError: + LOGGER.debug("value error in 
idiosyncrasies: %s", match[0]) + if date_validator( + candidate, "%Y-%m-%d", earliest=min_date, latest=max_date + ): + return candidate.strftime(outputformat) # type: ignore[union-attr] return None diff --git a/htmldate/validators.py b/htmldate/validators.py index 52144255..aa4ff029 100644 --- a/htmldate/validators.py +++ b/htmldate/validators.py @@ -48,12 +48,9 @@ def date_validator( return False else: dateobject = date_input - # basic year validation - year = int(datetime.strftime(dateobject, "%Y")) - min_year, max_year = earliest.year, latest.year - # full validation: not newer than today or stored variable + # year first, then full validation: not newer than today or stored variable if ( - min_year <= year <= max_year + earliest.year <= dateobject.year <= latest.year and earliest.timestamp() <= dateobject.timestamp() <= latest.timestamp() ): return True @@ -89,34 +86,26 @@ def plausible_year_filter( incomplete: bool = False, ) -> Counter_Type[str]: """Filter the date patterns to find plausible years only""" - # slow! - occurrences = Counter(pattern.findall(htmlstring)) - toremove = set() - # LOGGER.debug('occurrences: %s', occurrences) - # look for implausible dates - for item in occurrences.keys(): - # scrap implausible dates + occurrences = Counter(pattern.findall(htmlstring)) # slow! 
+ + for item in list(occurrences): year_match = yearpat.search(item) - if year_match is not None: - if not incomplete: - potential_year = int(year_match[1]) - else: - lastdigits = year_match[1] - if lastdigits[0] == "9": - potential_year = int("19" + lastdigits) - else: - potential_year = int("20" + lastdigits) - if not earliest.year <= potential_year <= latest.year: - LOGGER.debug("no potential year: %s", item) - toremove.add(item) - # occurrences.remove(item) - # continue - else: + if year_match is None: LOGGER.debug("not a year pattern: %s", item) - toremove.add(item) - # remove candidates - for item in toremove: - del occurrences[item] + del occurrences[item] + continue + + lastdigits = year_match[1] + if not incomplete: + potential_year = int(lastdigits) + else: + century = "19" if lastdigits[0] == "9" else "20" + potential_year = int(century + lastdigits) + + if not earliest.year <= potential_year <= latest.year: + LOGGER.debug("no potential year: %s", item) + del occurrences[item] + return occurrences @@ -202,7 +191,7 @@ def check_date_input( return date_object if isinstance(date_object, str): try: - return datetime.fromisoformat(date_object) # type: ignore + return datetime.fromisoformat(date_object) # type: ignore[attr-defined] except ValueError: LOGGER.warning("invalid datetime string: %s", date_object) return default # no input or error thrown