From d3d138ea0e7e00797a818f47d5252b1e7ff031d2 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Tue, 7 Nov 2023 17:58:59 +0100
Subject: [PATCH] fix: account for special chars in normalization

---
 courlan/clean.py    | 12 ++++++++++--
 tests/unit_tests.py | 11 +++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/courlan/clean.py b/courlan/clean.py
index a2b25d5..b94fa92 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -148,6 +148,14 @@ def decode_punycode(string: str) -> str:
     return ".".join(parts)
 
 
+def normalize_part(url_part: str) -> str:
+    """Normalize URLs parts (specifically path and fragment) while
+    accounting for certain characters."""
+    if not "%" in url_part and not "!" in url_part:
+        url_part = quote(url_part)
+    return url_part
+
+
 def normalize_url(
     parsed_url: Union[SplitResult, str],
     strict: bool = False,
@@ -166,12 +174,12 @@ def normalize_url(
     netloc = decode_punycode(netloc.lower())
     # path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
     # leading /../'s in the path are removed
-    newpath = quote(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
+    newpath = normalize_part(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
     # strip unwanted query elements
     newquery = clean_query(parsed_url, strict, language) or ""
     if newquery and newpath == "":
         newpath = "/"
     # fragment
-    newfragment = "" if strict else quote(parsed_url.fragment)
+    newfragment = "" if strict else normalize_part(parsed_url.fragment)
     # rebuild
     return urlunsplit([scheme, netloc, newpath, newquery, newfragment])
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 8e8ab10..7a8cdd1 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -493,6 +493,17 @@ def test_normalization():
     assert normalize_url("http://xn--Mnchen-3ya.de") == "http://münchen.de"
     assert normalize_url("http://Mnchen-3ya.de") == "http://mnchen-3ya.de"
     assert normalize_url("http://xn--München.de") == "http://xn--münchen.de"
+    # account for particular characters
+    assert (
+        normalize_url(
+            "https://www.deutschlandfunknova.de/beitrag/nord--und-s%C3%BCdgaza-israels-armee-verk%C3%BCndet-teilung-des-gazastreifens"
+        )
+        == "https://www.deutschlandfunknova.de/beitrag/nord--und-s%C3%BCdgaza-israels-armee-verk%C3%BCndet-teilung-des-gazastreifens"
+    )
+    assert (
+        normalize_url("https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/")
+        == "https://taz.de/Zukunft-des-49-Euro-Tickets/!5968518/"
+    )
 
 
 def test_qelems():