From d8944bf65777c0b3e6330f6809458aa38df7e1df Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 26 Oct 2023 12:49:34 +0200 Subject: [PATCH] extractors: more precise regex components (#106) * extractors: more precise regex components * simplify further * simplify using YEAR_RE --- htmldate/extractors.py | 50 +++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/htmldate/extractors.py b/htmldate/extractors.py index 12d1a219..41d8858c 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -96,15 +96,19 @@ # .//footer # .//*[(self::div or self::section)][@id="footer" or @class="footer"] +DAY_RE = "[0-3]?[0-9]" +MONTH_RE = "[0-1]?[0-9]" +YEAR_RE = "199[0-9]|20[0-3][0-9]" + # regex cache YMD_NO_SEP_PATTERN = re.compile(r"\b(\d{8})\b") YMD_PATTERN = re.compile( - r"(?:\D|^)(?:(?P\d{4})[\-/.](?P\d{1,2})[\-/.](?P\d{1,2})|" - r"(?P\d{1,2})[\-/.](?P\d{1,2})[\-/.](?P\d{2,4}))(?:\D|$)" + rf"(?:\D|^)(?:(?P{YEAR_RE})[\-/.](?P{MONTH_RE})[\-/.](?P{DAY_RE})|" + rf"(?P{DAY_RE})[\-/.](?P{MONTH_RE})[\-/.](?P\d{{2,4}}))(?:\D|$)" ) YM_PATTERN = re.compile( - r"(?:\D|^)(?:(?P\d{4})[\-/.](?P\d{1,2})|" - r"(?P\d{1,2})[\-/.](?P\d{4}))(?:\D|$)" + rf"(?:\D|^)(?:(?P{YEAR_RE})[\-/.](?P{MONTH_RE})|" + rf"(?P{MONTH_RE})[\-/.](?P{YEAR_RE}))(?:\D|$)" ) REGEX_MONTHS = """ @@ -118,20 +122,22 @@ """ # todo: check "août" LONG_TEXT_PATTERN = re.compile( rf"""(?P{REGEX_MONTHS})\s -(?P[0-9]{{1,2}})(?:st|nd|rd|th)?,? (?P[0-9]{{4}})| -(?P[0-9]{{1,2}})(?:st|nd|rd|th|\.)? (?:of )? -(?P{REGEX_MONTHS})[,.]? (?P[0-9]{{4}})""".replace( +(?P{DAY_RE})(?:st|nd|rd|th)?,? (?P{YEAR_RE})| +(?P{DAY_RE})(?:st|nd|rd|th|\.)? (?:of )? +(?P{REGEX_MONTHS})[,.]? (?P{YEAR_RE})""".replace( "\n", "" ), re.I, ) -COMPLETE_URL = re.compile(r"\D([0-9]{4})[/_-]([0-9]{1,2})[/_-]([0-9]{1,2})(?:\D|$)") +COMPLETE_URL = re.compile(rf"\D({YEAR_RE})[/_-]({MONTH_RE})[/_-]({DAY_RE})(?:\D|$)") -JSON_MODIFIED = re.compile(r'"dateModified": ?"([0-9]{4}-[0-9]{2}-[0-9]{2})', re.I) -JSON_PUBLISHED = re.compile(r'"datePublished": ?"([0-9]{4}-[0-9]{2}-[0-9]{2})', re.I) +JSON_MODIFIED = re.compile(rf'"dateModified": ?"({YEAR_RE}-{MONTH_RE}-{DAY_RE})', re.I) +JSON_PUBLISHED = re.compile( + rf'"datePublished": ?"({YEAR_RE}-{MONTH_RE}-{DAY_RE})', re.I +) TIMESTAMP_PATTERN = re.compile( - r"([0-9]{4}-[0-9]{2}-[0-9]{2}).[0-9]{2}:[0-9]{2}:[0-9]{2}" + rf"({YEAR_RE}-{MONTH_RE}-{DAY_RE}).[0-9]{{2}}:[0-9]{{2}}:[0-9]{{2}}" ) # English, French, German, Indonesian and Turkish dates cache @@ -178,37 +184,37 @@ ) # core patterns -THREE_COMP_REGEX_A = re.compile(r"([0-3]?[0-9])[/.-]([01]?[0-9])[/.-]([0-9]{4})") +THREE_COMP_REGEX_A = re.compile(rf"({DAY_RE})[/.-]({MONTH_RE})[/.-]({YEAR_RE})") THREE_COMP_REGEX_B = re.compile( - r"([0-3]?[0-9])/([01]?[0-9])/([0-9]{2})|([0-3][0-9])[.-]([01][0-9])[.-]([0-9]{2})" + rf"({DAY_RE})/({MONTH_RE})/([0-9]{{2}})|({DAY_RE})[.-]({MONTH_RE})[.-]([0-9]{{2}})" ) -TWO_COMP_REGEX = re.compile(r"([0-3]?[0-9])[/.-]([0-9]{4})") +TWO_COMP_REGEX = re.compile(rf"({MONTH_RE})[/.-]({YEAR_RE})") # extensive search patterns -YEAR_PATTERN = re.compile(r"^\D?(199[0-9]|20[0-9]{2})") +YEAR_PATTERN = re.compile(rf"^\D?({YEAR_RE})") COPYRIGHT_PATTERN = re.compile( - r"(?:©|\©|Copyright|\(c\))\D*(?:[12][0-9]{3}-)?([12][0-9]{3})\D" + rf"(?:©|\©|Copyright|\(c\))\D*(?:{YEAR_RE}-)?({YEAR_RE})\D" ) THREE_PATTERN = re.compile(r"/([0-9]{4}/[0-9]{2}/[0-9]{2})[01/]") THREE_CATCH = re.compile(r"([0-9]{4})/([0-9]{2})/([0-9]{2})") THREE_LOOSE_PATTERN = re.compile(r"\D([0-9]{4}[/.-][0-9]{2}[/.-][0-9]{2})\D") THREE_LOOSE_CATCH = re.compile(r"([0-9]{4})[/.-]([0-9]{2})[/.-]([0-9]{2})") SELECT_YMD_PATTERN = re.compile(r"\D([0-3]?[0-9][/.-][01]?[0-9][/.-][0-9]{4})\D") -SELECT_YMD_YEAR = re.compile(r"(19[0-9]{2}|20[0-9]{2})\D?$") -YMD_YEAR = re.compile(r"^([0-9]{4})") +SELECT_YMD_YEAR = re.compile(rf"({YEAR_RE})\D?$") +YMD_YEAR = re.compile(rf"^({YEAR_RE})") DATESTRINGS_PATTERN = re.compile( r"(\D19[0-9]{2}[01][0-9][0-3][0-9]\D|\D20[0-9]{2}[01][0-9][0-3][0-9]\D)" ) -DATESTRINGS_CATCH = re.compile(r"([12][0-9]{3})([01][0-9])([0-3][0-9])") +DATESTRINGS_CATCH = re.compile(rf"({YEAR_RE})([01][0-9])([0-3][0-9])") SLASHES_PATTERN = re.compile( r"\D([0-3]?[0-9]/[01]?[0-9]/[0129][0-9]|[0-3][0-9]\.[01][0-9]\.[0129][0-9])\D" ) SLASHES_YEAR = re.compile(r"([0-9]{2})$") YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-][01][0-9])\D") -YYYYMM_CATCH = re.compile(r"([12][0-9]{3})[/.-]([01][0-9])") +YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-]([01][0-9])") MMYYYY_PATTERN = re.compile(r"\D([01]?[0-9][/.-][12][0-9]{3})\D") -MMYYYY_YEAR = re.compile(r"([12][0-9]{3})\D?$") -SIMPLE_PATTERN = re.compile(r"(? Tuple[HtmlElement, List[HtmlElement]]: