Skip to content

Commit

Permalink
'Refactored by Sourcery'
Browse files Browse the repository at this point in the history
  • Loading branch information
Sourcery AI committed Oct 26, 2023
1 parent 4dcf80d commit 0c9fb8b
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 27 deletions.
23 changes: 11 additions & 12 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,19 +218,17 @@ def examine_date_elements(
"analyzing (HTML): %s",
" ".join(logstring(elem).split())[:100],
)
attempt = try_date_expr(
if attempt := try_date_expr(
text, outputformat, extensive_search, min_date, max_date
)
if attempt:
):
return attempt
# try link title (Blogspot)
title_attr = trim_text(elem.get("title", ""))
if len(title_attr) > MIN_SEGMENT_LEN:
title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_SEGMENT_LEN])
attempt = try_date_expr(
if attempt := try_date_expr(
title_attr, outputformat, extensive_search, min_date, max_date
)
if attempt:
):
return attempt

return None
Expand Down Expand Up @@ -404,8 +402,7 @@ def select_candidate(
years = [""] * len(bestones)
validation = [False] * len(bestones)
for i, pattern in enumerate(patterns):
year_match = yearpat.search(pattern)
if year_match:
if year_match := yearpat.search(pattern):
years[i] = year_match[1]
dateobject = datetime(int(year_match[1]), 1, 1)
validation[i] = date_validator(
Expand Down Expand Up @@ -642,7 +639,7 @@ def normalize_match(match: Optional[Match[str]]) -> str:
and optionally expand the year from two to four digits."""
day, month, year = (g.zfill(2) for g in match.groups() if g) # type: ignore[union-attr]
if len(year) == 2:
year = "19" + year if year[0] == "9" else "20" + year
year = f"19{year}" if year[0] == "9" else f"20{year}"
return f"{year}-{month}-{day}"


Expand Down Expand Up @@ -870,7 +867,7 @@ def search_page(
match = TWO_COMP_REGEX.match(item)
month = match[1] # type: ignore[index]
if len(month) == 1:
month = "0" + month
month = f"0{month}"
candidate = "-".join([match[2], month, "01"]) # type: ignore[index]
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
Expand Down Expand Up @@ -923,9 +920,11 @@ def search_page(
if bestmatch is not None:
dateobject = datetime(int(bestmatch[0]), 1, 1)
if (
date_validator(dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date)
date_validator(
dateobject, "%Y-%m-%d", earliest=min_date, latest=max_date
)
is True
and int(dateobject.year) >= copyear
and dateobject.year >= copyear
):
LOGGER.debug(
'date found for pattern "%s": %s', SIMPLE_PATTERN, bestmatch[0]
Expand Down
23 changes: 8 additions & 15 deletions htmldate/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Custom parsers and XPath expressions for date extraction
"""


## This file is available from https://github.com/adbar/htmldate
## under GNU GPL v3 license

Expand Down Expand Up @@ -85,7 +86,7 @@
# or contains(@class, 'article')
# or contains(@id, 'lastmod') or contains(@class, 'updated')

FREE_TEXT_EXPRESSIONS = FAST_PREPEND + "/text()"
FREE_TEXT_EXPRESSIONS = f"{FAST_PREPEND}/text()"
MIN_SEGMENT_LEN = 6
MAX_SEGMENT_LEN = 52

Expand Down Expand Up @@ -230,8 +231,7 @@ def extract_url_date(
testurl: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
"""Extract the date out of an URL string complying with the Y-M-D format"""
match = COMPLETE_URL.search(testurl)
if match:
if match := COMPLETE_URL.search(testurl):
LOGGER.debug("found date in URL: %s", match[0])
try:
dateobject = datetime(int(match[1]), int(match[2]), int(match[3]))
Expand Down Expand Up @@ -326,9 +326,7 @@ def custom_parse(
LOGGER.debug("parsing result: %s", candidate)
return candidate.strftime(outputformat)

# 2. Try YYYYMMDD, use regex
match = YMD_NO_SEP_PATTERN.search(string)
if match:
if match := YMD_NO_SEP_PATTERN.search(string):
try:
year, month, day = int(match[1][:4]), int(match[1][4:6]), int(match[1][6:8])
candidate = datetime(year, month, day)
Expand All @@ -344,9 +342,7 @@ def custom_parse(
LOGGER.debug("YYYYMMDD match: %s", candidate)
return candidate.strftime(outputformat)

# 3. Try the very common YMD, Y-M-D, and D-M-Y patterns
match = YMD_PATTERN.search(string)
if match:
if match := YMD_PATTERN.search(string):
try:
if match.lastgroup == "day":
year, month, day = (
Expand All @@ -373,9 +369,7 @@ def custom_parse(
LOGGER.debug("regex match: %s", candidate)
return candidate.strftime(outputformat)

# 4. Try the Y-M and M-Y patterns
match = YM_PATTERN.search(string)
if match:
if match := YM_PATTERN.search(string):
try:
if match.lastgroup == "month":
candidate = datetime(
Expand Down Expand Up @@ -524,14 +518,13 @@ def idiosyncrasies_search(
htmlstring: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
"""Look for author-written dates throughout the web page"""
match = TEXT_PATTERNS.search(htmlstring) # EN+DE+TR
if match:
if match := TEXT_PATTERNS.search(htmlstring):
parts = list(filter(None, match.groups()))
if len(parts) == 3:
candidate = None
if len(parts[0]) == 4:
candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2]))
elif len(parts[2]) in (2, 4):
elif len(parts[2]) in {2, 4}:
# DD/MM/YY
day, month = try_swap_values(int(parts[0]), int(parts[1]))
year = correct_year(int(parts[2]))
Expand Down

0 comments on commit 0c9fb8b

Please sign in to comment.