diff --git a/htmldate/core.py b/htmldate/core.py index f6fe7bb4..a0546a52 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -218,19 +218,17 @@ def examine_date_elements( "analyzing (HTML): %s", " ".join(logstring(elem).split())[:100], ) - attempt = try_date_expr( + if attempt := try_date_expr( text, outputformat, extensive_search, min_date, max_date - ) - if attempt: + ): return attempt # try link title (Blogspot) title_attr = trim_text(elem.get("title", "")) if len(title_attr) > MIN_SEGMENT_LEN: title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_SEGMENT_LEN]) - attempt = try_date_expr( + if attempt := try_date_expr( title_attr, outputformat, extensive_search, min_date, max_date - ) - if attempt: + ): return attempt return None @@ -404,8 +402,7 @@ def select_candidate( years = [""] * len(bestones) validation = [False] * len(bestones) for i, pattern in enumerate(patterns): - year_match = yearpat.search(pattern) - if year_match: + if year_match := yearpat.search(pattern): years[i] = year_match[1] dateobject = datetime(int(year_match[1]), 1, 1) validation[i] = date_validator( @@ -642,7 +639,7 @@ def normalize_match(match: Optional[Match[str]]) -> str: and optionally expand the year from two to four digits.""" day, month, year = (g.zfill(2) for g in match.groups() if g) # type: ignore[union-attr] if len(year) == 2: - year = "19" + year if year[0] == "9" else "20" + year + year = f"19{year}" if year[0] == "9" else f"20{year}" return f"{year}-{month}-{day}" @@ -870,7 +867,7 @@ def search_page( match = TWO_COMP_REGEX.match(item) month = match[1] # type: ignore[index] if len(month) == 1: - month = "0" + month + month = f"0{month}" candidate = "-".join([match[2], month, "01"]) # type: ignore[index] replacement[candidate] = candidates[item] candidates = Counter(replacement) @@ -923,9 +920,11 @@ def search_page( if bestmatch is not None: dateobject = datetime(int(bestmatch[0]), 1, 1) if ( - date_validator(dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date) + date_validator( + dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date + ) is True - and int(dateobject.year) >= copyear + and dateobject.year >= copyear ): LOGGER.debug( 'date found for pattern "%s": %s', SIMPLE_PATTERN, bestmatch[0] diff --git a/htmldate/extractors.py b/htmldate/extractors.py index 02ec01fd..5a7a1531 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -3,6 +3,7 @@ Custom parsers and XPath expressions for date extraction """ + ## This file is available from https://github.com/adbar/htmldate ## under GNU GPL v3 license @@ -85,7 +86,7 @@ # or contains(@class, 'article') # or contains(@id, 'lastmod') or contains(@class, 'updated') -FREE_TEXT_EXPRESSIONS = FAST_PREPEND + "/text()" +FREE_TEXT_EXPRESSIONS = f"{FAST_PREPEND}/text()" MIN_SEGMENT_LEN = 6 MAX_SEGMENT_LEN = 52 @@ -230,8 +231,7 @@ def extract_url_date( testurl: str, outputformat: str, min_date: datetime, max_date: datetime ) -> Optional[str]: """Extract the date out of an URL string complying with the Y-M-D format""" - match = COMPLETE_URL.search(testurl) - if match: + if match := COMPLETE_URL.search(testurl): LOGGER.debug("found date in URL: %s", match[0]) try: dateobject = datetime(int(match[1]), int(match[2]), int(match[3])) @@ -326,9 +326,7 @@ def custom_parse( LOGGER.debug("parsing result: %s", candidate) return candidate.strftime(outputformat) - # 2. Try YYYYMMDD, use regex - match = YMD_NO_SEP_PATTERN.search(string) - if match: + if match := YMD_NO_SEP_PATTERN.search(string): try: year, month, day = int(match[1][:4]), int(match[1][4:6]), int(match[1][6:8]) candidate = datetime(year, month, day) @@ -344,9 +342,7 @@ def custom_parse( LOGGER.debug("YYYYMMDD match: %s", candidate) return candidate.strftime(outputformat) - # 3. Try the very common YMD, Y-M-D, and D-M-Y patterns - match = YMD_PATTERN.search(string) - if match: + if match := YMD_PATTERN.search(string): try: if match.lastgroup == "day": year, month, day = ( @@ -373,9 +369,7 @@ def custom_parse( LOGGER.debug("regex match: %s", candidate) return candidate.strftime(outputformat) - # 4. Try the Y-M and M-Y patterns - match = YM_PATTERN.search(string) - if match: + if match := YM_PATTERN.search(string): try: if match.lastgroup == "month": candidate = datetime( @@ -524,14 +518,13 @@ def idiosyncrasies_search( htmlstring: str, outputformat: str, min_date: datetime, max_date: datetime ) -> Optional[str]: """Look for author-written dates throughout the web page""" - match = TEXT_PATTERNS.search(htmlstring) # EN+DE+TR - if match: + if match := TEXT_PATTERNS.search(htmlstring): parts = list(filter(None, match.groups())) if len(parts) == 3: candidate = None if len(parts[0]) == 4: candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2])) - elif len(parts[2]) in (2, 4): + elif len(parts[2]) in {2, 4}: # DD/MM/YY day, month = try_swap_values(int(parts[0]), int(parts[1])) year = correct_year(int(parts[2]))