Skip to content

Commit

Permalink
extractors: better discard regexes (#87)
Browse files Browse the repository at this point in the history
* extractors: better discard regexes

* refine discarding
  • Loading branch information
adbar authored Aug 25, 2023
1 parent a7548d7 commit db1dabc
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions htmldate/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,22 +158,22 @@
}

# Cheap pre-filter applied before the slow external date parser: matches a
# string that contains at least one separator character (. : , _ / space -)
# or that consists of digits only.
TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")
# Strings that try_date_expr rejects when matched from their start: two runs
# of 3+ digits separated by non-digits, an HH:MM-like pair followed by ":" or
# a space, "+NN" followed by non-digits, or a single four-digit number
# surrounded only by non-digits.
# NOTE(review): in this diff view the .match() call using this pattern sits
# directly next to a DISCARD_PATTERNS.search() call -- presumably this
# constant belongs to the removed side of the change; confirm against the file.
NO_TEXT_DATE_PATTERN = re.compile(
r"\d{3,}\D+\d{3,}|\d{2}:\d{2}(:| )|\+\d{2}\D+|\D*\d{4}\D*$"
)
# leads to errors: \D+\d{3,}\D+


# Markers that make try_date_expr give up on a string: it returns None when
# this pattern is found anywhere in the candidate.
# NOTE(review): this span comes from a rendered diff, so removed and added
# lines appear back to back; the literal ending in \b has no trailing "|" and
# therefore concatenates with the literal that follows it. Confirm the
# intended alternation against the actual file before relying on it.
DISCARD_PATTERNS = re.compile(
r"[$€¥Ұ£¢₽₱฿#]|" # currency symbols
r"CNY|EUR|GBP|JPY|USD|" # currency codes
r"http|" # protocols
r"\.(com|net|org)|" # TLDs
r"IBAN|" # bank accounts
r"\+\d{2}\b" # amounts/telephone numbers
r"^\d{2}:\d{2}(?: |:|$)|" # time-like HH:MM at the start of the string
r"^\D*\d{4}\D*$|" # a single four-digit number, e.g. a lone year
r"[$€¥Ұ£¢₽₱฿#₹]|" # currency symbols and special characters
r"[A-Z]{3}[^A-Z]|" # three capitals then a non-capital, e.g. currency codes
r"(?:^|\D)(?:\+\d{2}|\d{3}|\d{5})\D|" # tel./IPs/postal codes
r"ftps?|https?|sftp|" # protocols
r"\.(com|net|org|info|gov|edu|de|fr|io)\b|" # TLDs
r"IBAN|[A-Z]{2}[0-9]{2}|" # bank accounts
r"®" # registered trademark sign
)
# further testing required:
# \d[,.]\d+ # currency amounts
# \b\d{5}\s # postal codes
# leads to errors: ^\D+\d{3,}\D+

# use of regex module for speed?
TEXT_PATTERNS = re.compile(
Expand Down Expand Up @@ -461,7 +461,7 @@ def try_date_expr(
return None

# check if string only contains time/single year or digits and not a date
if NO_TEXT_DATE_PATTERN.match(string):
if DISCARD_PATTERNS.search(string):
return None

# try to parse using the faster method
Expand All @@ -472,7 +472,7 @@ def try_date_expr(
# use slow but extensive search
if extensive_search:
# additional filters to prevent computational cost
if not TEXT_DATE_PATTERN.search(string) or DISCARD_PATTERNS.search(string):
if not TEXT_DATE_PATTERN.search(string):
return None
# send to date parser
dateparser_result = external_date_parser(string, outputformat)
Expand Down

0 comments on commit db1dabc

Please sign in to comment.